# JSON Processing

## Set Evns

In [12]:
import os, sys
import json
envs = {
    'pwd': os.getcwd(),
    'datapath': os.path.join(os.getcwd(), 'data'),
    'infilename': r'dictionary.json',
    'outfilename': r'etl_dictionary.json',
}
envs['infilepath'] =  os.path.join(envs['datapath'], envs['infilename'])
envs['outfilepath'] = os.path.join(envs['datapath'], envs['outfilename'])
envs

{'pwd': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience',
 'datapath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data',
 'infilename': 'dictionary.json',
 'outfilename': 'etl_dictionary.json',
 'infilepath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data\\dictionary.json',
 'outfilepath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data\\etl_dictionary.json'}

## ETL Data

In [24]:
inputfile = envs['infilepath']
outputfile = envs['outfilepath']
with open(inputfile, 'r') as ifo, open(outputfile, 'w+') as ofo:
    indata = json.load(ifo)
    for count, (k, v) in enumerate(indata.items()):
        continue
    else:
        print(f'Total lines: {count}')
    json.dump(indata, ofo, indent=4, separators=(', ', ': '))

    ofo.seek(0)
    outdata = ofo.read()
    print(f'''sizeof "{envs['infilename']}": {sys.getsizeof(indata)/1024/1024:.3} MB''')
    print(f'''sizeof "{envs['outfilename']}": {sys.getsizeof(outdata)/1024/1024:.3} MB''')

Total lines: 49536
sizeof "dictionary.json": 2.5 MB
sizeof "etl_dictionary.json": 4.67e-05 MB


# PDF Processing with pdfminer

## Refs

- pdfminer Check
    - Ref: https://euske.github.io/pdfminer/programming.html

## Set Envs

In [55]:
import os, sys
envs = {
    'pwd': os.getcwd(),
    'datapath': os.path.join(os.getcwd(), 'data'),
    'infilename': r'2003-Text Processing in Python.pdf',
    'outfilename': r'2003-Text Processing in Python.txt',
    'mobydick': r'mobydick.txt'
}
envs['infilepath'] =  os.path.join(envs['datapath'], envs['infilename'])
envs['outfilepath'] = os.path.join(envs['datapath'], envs['outfilename'])
envs['mobydickpath'] = os.path.join(envs['datapath'], envs['mobydick'])

envs

{'pwd': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience',
 'datapath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data',
 'infilename': '2003-Text Processing in Python.pdf',
 'outfilename': '2003-Text Processing in Python.txt',
 'mobydick': 'mobydick.txt',
 'infilepath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data\\2003-Text Processing in Python.pdf',
 'outfilepath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data\\2003-Text Processing in Python.txt',
 'mobydickpath': 'C:\\Users\\hpzju\\Desktop\\MyKB\\DataScience\\data\\mobydick.txt'}

## check file and filesystem

In [16]:
inputfile = envs['infilepath']
outputfile = envs['outfilepath']
with open(inputfile, 'br') as ifo, open(outputfile, 'w+') as ofo:
    indata = ifo.read()
    outdata = ofo.read()
    print(f'''sizeof "{envs['infilename']}": {sys.getsizeof(indata)/1024/1024:.3} MB''')
    print(f'''sizeof "{envs['outfilename']}": {sys.getsizeof(outdata)/1024/1024:.3} MB''')

sizeof "2003-Text Processing in Python.pdf": 1.51 MB
sizeof "2003-Text Processing in Python.txt": 4.67e-05 MB


## processing PDF to txt

In [17]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator

In [23]:
# Open a PDF file.
with open(inputfile, 'br') as ifo, open(outputfile, 'w') as ofo:
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(ifo)
    
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password="")
    
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDF resource manager object that stores shared resources.
    rcmgr = PDFResourceManager()

    # Create a PDF device object.
    lap = LAParams()
    device = PDFPageAggregator(rcmgr, laparams=lap)

    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rcmgr, device)
    
    # Process each page contained in the document.
    for num, page in enumerate(PDFPage.create_pages(document)):
        interpreter.process_page(page)
        layout = device.get_result()
        for element in layout:
            if isinstance(element, LTTextBoxHorizontal):
                ofo.write(element.get_text())
     
    print(f"Done with total {num} pages ")

Done with total 504 pages 


# TXT Processing with re

In [50]:
from collections import Counter
import string
import re

In [58]:
processingfile = envs['outfilepath']
mobydickfile = envs['mobydickpath']

In [64]:
word_stat = Counter()
word_matcher = re.compile(r'(\w+)')
with open(mobydickfile, 'r') as fo:
    for index, line in enumerate(fo.readlines()):
        line=line.strip()
        words = word_matcher.split(line)[1::2]
        line_stat = Counter(words)
        word_stat.update(line_stat)
#         print(f'{index:>5}:"{line}":')
#         print(f'\t{line_stat}')
#         if index > 100:
#             break
    print(f'total lines: {index+1}')
    print(f'totoal words: {sum(word_stat.values())}')
    print(f'Most common TopN words:')
    for topn, word in enumerate(word_stat.most_common(30)):
        print(f'top {topn+1}: ', word)


total lines: 22333
totoal words: 222663
Most common TopN words:
top 1:  ('the', 13972)
top 2:  ('of', 6699)
top 3:  ('and', 6144)
top 4:  ('a', 4648)
top 5:  ('to', 4635)
top 6:  ('in', 3997)
top 7:  ('that', 2994)
top 8:  ('his', 2472)
top 9:  ('it', 2222)
top 10:  ('I', 2120)
top 11:  ('s', 1807)
top 12:  ('is', 1723)
top 13:  ('with', 1709)
top 14:  ('he', 1665)
top 15:  ('was', 1635)
top 16:  ('as', 1631)
top 17:  ('all', 1484)
top 18:  ('for', 1442)
top 19:  ('this', 1318)
top 20:  ('at', 1251)
top 21:  ('by', 1165)
top 22:  ('not', 1124)
top 23:  ('but', 1117)
top 24:  ('from', 1076)
top 25:  ('him', 1060)
top 26:  ('be', 1048)
top 27:  ('on', 1027)
top 28:  ('so', 919)
top 29:  ('whale', 911)
top 30:  ('one', 895)


# Tools and Patterns

## Tokenization

In [96]:
import re
from collections import namedtuple
token_pattern = {
    'NAME': r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)',
    'NUM': r'(?P<NUM>\d+)', 
    'OP': r'(?P<OP>[+\-*/])',
    'EQ': r'(?P<EQ>[=])',
    'WS': r'(?P<WS>\s+)'
}

print('|'.join([token_pattern[key] for key in token_pattern]))

pattern = re.compile('|'.join([token_pattern[key] for key in token_pattern]))

print(pattern)

(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\d+)|(?P<OP>[+\-*/])|(?P<EQ>[=])|(?P<WS>\s+)
re.compile('(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)|(?P<NUM>\\d+)|(?P<OP>[+\\-*/])|(?P<EQ>[=])|(?P<WS>\\s+)')


In [118]:
line = 'foo = 42+23-422*2/23'

sc = pattern.scanner(line)

In [119]:
Token = namedtuple('Token', ['Type','Value'])
def generate_tokens(pat, textline):
    scanner = pat.scanner(textline)
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())

In [120]:
for tok in generate_tokens(pattern, line):
    print(tok)

Token(Type='NAME', Value='foo')
Token(Type='WS', Value=' ')
Token(Type='EQ', Value='=')
Token(Type='WS', Value=' ')
Token(Type='NUM', Value='42')
Token(Type='OP', Value='+')
Token(Type='NUM', Value='23')
Token(Type='OP', Value='-')
Token(Type='NUM', Value='422')
Token(Type='OP', Value='*')
Token(Type='NUM', Value='2')
Token(Type='OP', Value='/')
Token(Type='NUM', Value='23')


## Simple Parser