<a href="https://colab.research.google.com/github/jtneumann/DN_LP/blob/master/DN2_week2_trial2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook provides recipes for loading and saving data from external sources.

# Local file system

## Uploading files from your local file system

`files.upload` returns a dictionary of the files which were uploaded.
The dictionary is keyed by the file name and values are the data which were uploaded.

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving 211559-050.pdf to 211559-050.pdf
Saving 1900070.pdf to 1900070.pdf
Saving pdf1.ini to pdf1.ini
Saving pdf2.ini to pdf2.ini
User uploaded file "211559-050.pdf" with length 3387068 bytes
User uploaded file "1900070.pdf" with length 136238 bytes
User uploaded file "pdf1.ini" with length 284 bytes
User uploaded file "pdf2.ini" with length 247 bytes


In [0]:
!pip install PyPDF2 pdfplumber
import pandas as pd
import configparser
import PyPDF2, pdfplumber
import os



In [0]:
def readini(fname):
    parser = configparser.ConfigParser()
    parser.read(fname)
    fieldsToRead = {section: dict(parser.items(section)) for section in parser.sections()}
    return fieldsToRead

In [0]:
Reading = readini('pdf1 (3).ini')

In [0]:
Reading

{'row1': {'amt1|e': 'Amount',
  'desc1|e': 'Description',
  'pr1|e': 'Price',
  'qty1|e': 'Quantity',
  'ref1|e': 'Reference'},
 'row2': {'amt2|e': 'Amount',
  'desc2|e': 'Description',
  'pr2|e': 'Price',
  'qty2|e': 'Quantity',
  'ref2|e': 'Reference'},
 'settings': {'deliverydate': 'Date',
  'deliverynote': 'ALBARAN',
  'dir': "'.'",
  'type': 'ALBARAN'}}

In [0]:
type(Reading)

dict

In [0]:
Reading.keys()

dict_keys(['settings', 'row1', 'row2'])

In [0]:
Reading.values()

dict_values([{'dir': "'.'", 'type': 'ALBARAN', 'deliverynote': 'ALBARAN', 'deliverydate': 'Date'}, {'ref1|e': 'Reference', 'desc1|e': 'Description', 'qty1|e': 'Quantity', 'pr1|e': 'Price', 'amt1|e': 'Amount'}, {'ref2|e': 'Reference', 'desc2|e': 'Description', 'qty2|e': 'Quantity', 'pr2|e': 'Price', 'amt2|e': 'Amount'}])

In [0]:
Reading['row1']['ref1|e']

'Reference'

In [0]:
with pdfplumber.open('211559-050.pdf') as pdf:
    page = pdf.pages[0]
    text = page.extract_text()
    table = page.extract_tables()
    words = page.extract_words()

use plumber to extract_text for date and ALBARAN/TALLER info
then pypdf2 getFormTextFields() for dictionary of all remaining info needed.

In [0]:
print(text)

TALLER DELIVERY  NOTE
00211559_050
Tudela
Transportista El transportista
forwarder:
Matrícula
Nº Bultos Recibí
N. Boxes Signature:
Fecha / Date 16/08/2018
Cliente / Customer PHARMADEAL, S.A. Cod. Proveedor / Supplier  BRUNO,S.A.
No.
Dirección / AddressCTRA. MANZANA  Dirección / Address C.FUERTE,
RICO, S/N3350   S/N3350
PERROALTA PERROALTA
NIEVARRA  ES NIEVARRA ES
ORDER NO REFERENCE QUANTITY CONCEPT DESCRIPTION
El responsable de la entrega del residuo de envase o envase usado para su correcta gestión ambiental 
será el poseedor final
RECIBÍ CONFORME
1 de1


In [0]:
obj = PyPDF2.PdfFileReader('1900070.pdf')

In [0]:
fields = obj.getFields()
print(fields)

{'Text1': {'/FT': '/Tx', '/T': 'Text1', '/Ff': 0, '/AA': {}}, 'Ref1': {'/FT': '/Tx', '/T': 'Ref1', '/Ff': 0, '/V': '0001', '/AA': {}}, 'Ref2': {'/FT': '/Tx', '/T': 'Ref2', '/Ff': 0, '/V': '0002', '/AA': {}}, 'Desc1': {'/FT': '/Tx', '/T': 'Desc1', '/Ff': 0, '/V': 'This is a test', '/AA': {}}, 'Desc2': {'/FT': '/Tx', '/T': 'Desc2', '/Ff': 0, '/V': 'This is another test', '/AA': {}}, 'EUR_TOT': {'/FT': '/Tx', '/T': 'EUR_TOT', '/Ff': 0, '/AA': {}}, '%IVA': {'/FT': '/Tx', '/T': '%IVA', '/Ff': 0, '/AA': {}}, 'EUR_IVA': {'/FT': '/Tx', '/T': 'EUR_IVA', '/Ff': 0, '/AA': {}}, 'IMP_TOT': {'/FT': '/Tx', '/T': 'IMP_TOT', '/Ff': 0, '/AA': {}}, 'Qty1': {'/FT': '/Tx', '/T': 'Qty1', '/Ff': 0, '/V': '1', '/AA': {}}, 'Qty2': {'/FT': '/Tx', '/T': 'Qty2', '/Ff': 0, '/V': '2', '/AA': {}}, 'Pr1': {'/FT': '/Tx', '/T': 'Pr1', '/Ff': 0, '/V': '10', '/AA': {}}, 'Pr2': {'/FT': '/Tx', '/T': 'Pr2', '/Ff': 0, '/V': '5', '/AA': {}}, 'Amt1': {'/FT': '/Tx', '/T': 'Amt1', '/Ff': 0, '/V': '10', '/AA': {}}, 'Amt2': {'/FT'

In [0]:
textFields = obj.getFormTextFields()
print(textFields)

{'Text1': None, 'Ref1': '0001', 'Ref2': '0002', 'Desc1': 'This is a test', 'Desc2': 'This is another test', 'EUR_TOT': None, '%IVA': None, 'EUR_IVA': None, 'IMP_TOT': None, 'Qty1': '1', 'Qty2': '2', 'Pr1': '10', 'Pr2': '5', 'Amt1': '10', 'Amt2': '10'}


In [0]:
metadata = obj.getDocumentInfo()
print(metadata)

{'/CreationDate': "D:20190114113916+01'00'", '/ModDate': "D:20200405193901+02'00'", '/Producer': 'Amyuni Document Converter version 5.0.1.9', '/Title': 'Sage PDF Document'}


from pdfplumber methods above

In [0]:
print(table)

[[['Date', 'ALBARAN Nº:', None, None, None, None, None, None], ['14-01-19', '1900070', None, None, None, None, None, None], ['Cod. Client', 'Vencimiento', None, None, None, None, None, None], ['004301', '', None, None, None, None, None, None], ['Reference', 'Description', None, 'Quantity', None, 'Price', None, 'Amount'], ['', '', None, '', None, '', None, ''], [None, None, 'Base Amount', None, '% IVA', None, 'Amount Total', None], [None, None, '', None, '', None, '', None]]]


In [0]:
print(words)

[{'x0': Decimal('16.920'), 'x1': Decimal('30.723'), 'top': Decimal('13.716'), 'bottom': Decimal('28.913'), 'upright': 1, 'text': 'AAFBPP3NT9F9T9F9mRDCP%%TSTCP0N10T11TCO214444,oouauroeeOaaeL3EOeeA,,.bRE5a0Ie'}, {'x0': Decimal('25.388'), 'x1': Decimal('33.157'), 'top': Decimal('13.716'), 'bottom': Decimal('28.913'), 'upright': 1, 'text': '0338888ttnrmllxxofM1st.IDsR0..VrT'}, {'x0': Decimal('28.341'), 'x1': Decimal('38.767'), 'top': Decimal('13.716'), 'bottom': Decimal('28.913'), 'upright': 1, 'text': 'Laa.cFto5--77YDcAittV502eae77A.tailAlAAia.r055'}, {'x0': Decimal('35.148'), 'x1': Decimal('39.687'), 'top': Decimal('15.088'), 'bottom': Decimal('25.950'), 'upright': 1, 'text': 'oí7'}, {'x0': Decimal('35.754'), 'x1': Decimal('44.523'), 'top': Decimal('13.716'), 'bottom': Decimal('28.913'), 'upright': 1, 'text': 'rdoA:snB55aI'}, {'x0': Decimal('37.289'), 'x1': Decimal('44.894'), 'top': Decimal('13.716'), 'bottom': Decimal('28.913'), 'upright': 1, 'text': 'NiILCvRL'}, {'x0': Decimal('38.739'

In [0]:
def getPdfs(*dict: dict) -> list:
    path = [val for settings in dict for val in settings]
    allPdfs = [file for root, dirs, files in os.walk(path) for file in files if file.endswith('.pdf')]
    return allPdfs

In [0]:
pdfs = getPdfs('pdf1 (3)','pdf2 (2)')

TypeError: ignored

In [0]:
def readini(fname) -> list:
    # Instructions to read an ini file go here and/or below ...
    parser = configparser.ConfigParser()
    parser.read(fname)
    fieldsToRead = {section: dict(parser.items(section)) for section in parser.sections()}
    settings, row1, row2 = list(fieldsToRead)
    return settings, row1, row2

In [0]:
test1 = readini('pdf1 (3).ini')

In [0]:
t = [(i, j) for j in test1.index(1)]

ValueError: ignored

In [0]:
t = [ test1[i] for i in test1]

TypeError: ignored

In [0]:
def getPdfs(*dict: dict) -> list:
    path = [dict['settings']['dir'] for k, v in dict.items() for key in v if dict[key] == dict['settings']['dir']]
    allPdfs = [file for root, dirs, files in os.walk(path) for file in files if file.endswith('.pdf')]
    return allPdfs

In [0]:
t = getPdfs('pdf1.ini')

AttributeError: ignored