In [10]:
import pandas as pd

In [11]:
class DataExtractor:
    """Flatten pickled invoice records into a table with one row per invoice item.

    Each invoice is read as a mapping with the keys ``'id'`` and
    ``'created_on'`` and an optional ``'items'`` sequence. Every entry of
    ``'items'`` is a mapping with a ``'quantity'`` and a nested ``'item'``
    mapping holding ``'id'``, ``'name'``, ``'type'`` and ``'unit_price'``.

    Args:
        invoices_file: Path of the pickle file holding the invoices.
        expired_invoices_file: Path of a text file listing the ids of expired
            invoices, separated by commas and/or whitespace.
    """

    # Spelled-out numbers accepted by convert_to_int (matched case-insensitively).
    _NUMBER_WORDS = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
        "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
        "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18,
        "nineteen": 19, "twenty": 20,
    }

    # Numeric item-type codes and the labels written to the 'type' column.
    _ITEM_TYPES = {0: 'Material',
                   1: 'Equipment',
                   2: 'Service',
                   3: 'Other'}

    # Output columns, in order. Passing them explicitly keeps the frame
    # well-formed (and sortable) even when there are no rows at all.
    _COLUMNS = ['invoice_id', 'created_on', 'invoiceitem_id',
                'invoiceitem_name', 'type', 'unit_price', 'total_price',
                'percentage_in_invoice', 'is_expired']

    def __init__(self, invoices_file, expired_invoices_file):
        self.invoices_file = invoices_file
        self.expired_invoices_file = expired_invoices_file

    def convert_to_int(self, data):
        """Convert a raw value to ``int``, tolerating two kinds of dirty strings.

        Strings that spell a number from "one" to "twenty" (any letter case,
        surrounding whitespace ignored) map to that number. In any other
        string every capital letter ``O`` is replaced by the digit ``0``
        before parsing. Non-string values are passed straight to ``int()``.

        Args:
            data: The value to convert.

        Returns:
            The value as an ``int``.

        Raises:
            ValueError: If the (cleaned) string is not a valid integer literal.
            TypeError: If ``int()`` does not accept the value (e.g. ``None``).
        """
        if isinstance(data, str):
            word = data.strip().lower()
            if word in self._NUMBER_WORDS:
                return self._NUMBER_WORDS[word]
            data = data.replace('O', '0')
        return int(data)

    def _read_expired_ids(self):
        """Return the ids listed in the expired-invoices file as a set of ints."""
        with open(self.expired_invoices_file, 'r') as file:
            tokens = file.read().replace(',', ' ').split()
        # The tokens are strings while invoice ids are ints; parse them with
        # the same normalisation so that membership tests can actually match.
        return {self.convert_to_int(token) for token in tokens}

    def _parse_item(self, entry):
        """Normalise one ``'items'`` entry into the item-level output fields."""
        item = entry['item']
        unit_price = self.convert_to_int(item['unit_price'])
        quantity = self.convert_to_int(entry['quantity'])
        return {
            'invoiceitem_id': self.convert_to_int(item['id']),
            'invoiceitem_name': str(item['name']),
            # An unknown type code raises KeyError rather than being guessed.
            'type': self._ITEM_TYPES[self.convert_to_int(item['type'])],
            'unit_price': unit_price,
            'total_price': quantity * unit_price,
        }

    def extract_data(self):
        """Build the flattened invoice-item table.

        An invoice whose ``'items'`` is missing, ``None`` or empty contributes
        a single placeholder row whose item-level fields are ``None``.
        ``percentage_in_invoice`` is the item's share of the invoice total and
        is left missing when that total is zero. ``created_on`` values that
        cannot be parsed become ``NaT``.

        Returns:
            pandas.DataFrame: One row per invoice item, sorted by
            ``invoice_id`` then ``invoiceitem_id``, with a fresh index.

        Raises:
            ValueError: If an id, price, quantity or type (or a token in the
                expired-invoices file) cannot be converted to an integer.
            KeyError: If a required key is absent or an item type code is
                not one of the known codes.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # invoice files that come from a trusted source.
        invoices = pd.read_pickle(self.invoices_file)
        expired_invoice_ids = self._read_expired_ids()

        rows = []
        for invoice in invoices:
            invoice_fields = {
                'invoice_id': self.convert_to_int(invoice['id']),
                'created_on': pd.to_datetime(invoice['created_on'], errors='coerce'),
            }
            is_expired = invoice_fields['invoice_id'] in expired_invoice_ids

            items = invoice.get('items')
            if items is None or len(items) == 0:
                rows.append({
                    **invoice_fields,
                    'invoiceitem_id': None,
                    'invoiceitem_name': None,
                    'type': None,
                    'unit_price': None,
                    'total_price': None,
                    'percentage_in_invoice': None,
                    'is_expired': is_expired,
                })
                continue

            # Parse every item once, then derive the invoice total from the
            # already-normalised line totals.
            parsed_items = [self._parse_item(entry) for entry in items]
            invoice_total = sum(parsed['total_price'] for parsed in parsed_items)

            for parsed in parsed_items:
                # A zero invoice total makes the share undefined; leave it
                # missing instead of raising ZeroDivisionError.
                if invoice_total:
                    share = float(parsed['total_price'] / invoice_total)
                else:
                    share = None
                rows.append({
                    **invoice_fields,
                    **parsed,
                    'percentage_in_invoice': share,
                    'is_expired': is_expired,
                })

        df = pd.DataFrame(rows, columns=self._COLUMNS)
        df = df.sort_values(by=['invoice_id', 'invoiceitem_id'], ascending=True).reset_index(drop=True)
        return df

In [12]:
# Example usage: build the flattened invoice table from the two input files
# under data/ and write it to extracted_data.csv in the current working
# directory (index=False leaves the row index out of the CSV).
# NOTE(review): in a notebook kernel __name__ is normally "__main__", so this
# guard passes and `extractor` / `df` stay defined as globals afterwards.
if __name__ == "__main__":
    extractor = DataExtractor("data/invoices_new.pkl", "data/expired_invoices.txt")
    df = extractor.extract_data()
    df.to_csv("extracted_data.csv", index=False)