In [None]:
import pandas as pd
import numpy as np
import tabula
import math
import os
from datetime import date

In [None]:
class VacsTable:
    """
    One table read from a source document, plus the methods to classify and clean it.

    Attributes
    ----------
    data : the table itself (anything exposing a ``shape`` tuple, e.g. a DataFrame)
    file : name of the file the table was read from
    index : position of the table among the tables read from that file
    type : detected table type; stays 'unknown' when no rule matches
    vac_qty : number of vacants in the table (stays 0 until the count is implemented)
    """

    # Registry of the table types the class knows about, grouped by category.
    table_types = {'vacants': [        # list of vacants tables admitted
                            'LD-PA common',   # LD-PA vacants usually generate (2, 14) DFs (need to separate?)
                            'CM common',      # CM vacants usually generate (2, 15) DFs
                            '2-16 shape',
                            'single',         # tables with a SINGLE VACANT generate different table structures
                            'otan'],
                  'posts': [np.nan]           # placeholder: no posts table type has been defined yet
                 }

    # Shape-based detection rules: DataFrame shape -> vacants table type.
    _shape_types = {
        (2, 14): 'LD-PA common',   # LD (30mar22) / PA (01abr22) vacants usually generate (2, 14) DFs
        (2, 15): 'CM common',      # CM (31mar22) vacants usually generate (2, 15) DFs
        (2, 16): '2-16 shape',     # other vacants that generate (2, 16) DFs
    }

    def __init__(self, df, filename, count):
        """
        Store the table and classify it immediately.

        Parameters
        ----------
        df : the table to wrap
        filename : name of the file the table comes from
        count : index of the table within that file
        """
        self.data = df
        self.file = filename
        self.index = count
        self.type = 'unknown'   # overwritten by __get_type() when a rule matches
        self.vac_qty = 0        # overwritten by __get_vac_qty() once implemented
        self.__get_type()
        self.__get_vac_qty()

    # Method to get the vacants TYPE
    def __get_type(self):
        """Set ``self.type`` from the shape of ``self.data``; leave 'unknown' if no rule matches."""
        detected = self._shape_types.get(self.data.shape)
        if detected is not None:
            # Sanity check: every detection rule must point at a registered type.
            assert detected in self.table_types['vacants']
            self.type = detected

        # TODO: detection rules for 'single' and 'otan' are not defined yet.
        #   Author's note (07MAR22): 16 columns, with "código OTAN" in the header.
        # Layouts seen so far that remain unclassified:
        #   03mar22. POSTS in international organizations
        #   03mar22. VACANTS "Cuerpos Comunes"
        #   03mar22. POSTS LD (Agustín)
        #   03mar22. POSTS CM (Forzoso - Capote)
        #   03mar22. VACANTS "Armada"
        #   03mar22. VACANTS "EA"
        #   17mar22. POSTS "OC"

    def __get_vac_qty(self):
        """Extract the number of vacants in the table (not implemented yet)."""
        pass

    # Block of methods to extract VACANTS
    def extract_vacs(self):
        """Extract vacants by calling the private extractor registered for ``self.type``, if any."""
        # NOTE(review): provisional mapping of the t1/t2 placeholders onto the first
        # two vacants types -- confirm once the extractors are actually written.
        extractors = {
            'LD-PA common': self.__extract_t1,
            'CM common': self.__extract_t2,
        }
        extractor = extractors.get(self.type)
        if extractor is not None:
            extractor()

    def __extract_t1(self):
        """Extract vacants from a type 1 table (not implemented yet)."""
        pass

    def __extract_t2(self):
        """Extract vacants from a type 2 table (not implemented yet)."""
        pass

    def __extract_tx(self):
        """Extract vacants from a type x table (not implemented yet)."""
        pass

    # Block of methods to extract POSTS
    def extract_posts(self):
        """Extract posts by calling the private extractor registered for ``self.type``, if any."""
        # TODO: no posts table type is defined yet, so nothing is registered here;
        # wire __extract_t99 / __extract_t98 in once their types exist.
        extractors = {}
        extractor = extractors.get(self.type)
        if extractor is not None:
            extractor()

    def __extract_t99(self):
        """Extract posts from a type 99 table (not implemented yet)."""
        pass

    def __extract_t98(self):
        """Extract posts from a type 98 table (not implemented yet)."""
        pass

    def __extract_txx(self):
        """Extract posts from a type xx table (not implemented yet)."""
        pass

    # Method to analyze unprocessed tables
    def to_unprocessed(self, sink=None):
        """
        Append a one-row summary DataFrame of this table to a list of discarded tables.

        Parameters
        ----------
        sink : list to append to. When omitted, the module-level ``discarded_tables``
            list is used, so it must have been created before this call.
        """
        info = {
            'file': self.file,
            'df_index': self.index,
            'shape': self.data.shape,
            'vac_qty': self.vac_qty,
            'data': self.data
        }

        # A list holding one dict yields exactly one row, keeping the shape tuple and
        # the table itself as single cell values instead of being spread over rows.
        record = pd.DataFrame([info])

        if sink is None:
            sink = discarded_tables
        sink.append(record)

In [None]:
# This cell looks for the BOD PDFs in the documents folder (docs_dir), compares them with
# the documents already processed and saved as .pkl (pkld_dir), and builds the list of
# files that still have to be processed (proc_docs).

docs_dir = 'D:/jaume/Datasets/BOD/'
pkld_dir = 'D:/jaume/Jupyter Notebooks/Vacantes Scraper/ScrapedData/'

docs_list = []    # serials of the documents found in the input folder
pkld_list = []    # serials of the documents already processed and saved as pkl
proc_docs = []    # directory entries of the documents still to be processed

doc_entries = []  # (serial, directory entry) pairs, in directory-scan order

# Scan the input folder once. The serial is the second '_'-separated field of the name.
with os.scandir(docs_dir) as entries:
    for entry in entries:
        if not entry.is_file():
            continue
        name_parts = entry.name.split('_')
        if len(name_parts) < 2:
            # Not named like a BOD document (no '_' in the name): skip it instead of
            # failing with IndexError.
            continue
        docs_list.append(name_parts[1])
        doc_entries.append((name_parts[1], entry))

# Scan the output folder. The serial is the file name up to the first '.'.
with os.scandir(pkld_dir) as entries:
    for entry in entries:
        if entry.is_file():
            pkld_list.append(entry.name.split('.')[0])

# Serials of the documents still to be processed (set lookup instead of list lookup).
processed_serials = set(pkld_list)
pending_list = [doc for doc in docs_list if doc not in processed_serials]

# Files to process: only regular files whose serial has not been pickled yet.
proc_docs = [entry for serial, entry in doc_entries if serial not in processed_serials]

In [None]:
# Tables that could not be classified; VacsTable.to_unprocessed() appends to this list,
# so it has to exist before the loop starts.
discarded_tables = []

for doc in proc_docs:

    print('Processing ' + doc.name + '...')
    file_pdf = os.path.join(docs_dir, doc.name)
    read_pdf = tabula.read_pdf(file_pdf, pages = 'all', silent = True)

    # Wrap every table read from the document and route it by its detected type.
    for count, df in enumerate(read_pdf):
        table = VacsTable(df, doc.name, count)
        # table_types is a class attribute of VacsTable, not a notebook-level name.
        if table.type in VacsTable.table_types['vacants']:
            table.extract_vacs()

        elif table.type in VacsTable.table_types['posts']:
            table.extract_posts()

        else:
            # Anything not registered must be the default 'unknown' type.
            assert table.type == 'unknown'
            table.to_unprocessed()              # keep a record of the unprocessed dataframe