---
# **Prerequisites**

**Install** dependency packages:

In [1]:
# For PDF to Text extraction
# See: https://github.com/jsvine/pdfplumber#python-library
!pip install pdfplumber



In [2]:
# For Thai spell checking & word correction
# See: https://github.com/PyThaiNLP/pythainlp
# See: https://www.bualabs.com/archives/3895/what-is-spell-checker-thai-language-spell-checker-pythainlp-spelling-correction-python-pythainlp-ep-3/
!pip install pythainlp



**Import** Dependencies

In [3]:
# For PDF file download
from pathlib import Path
import urllib3
# For PDF to Text extraction
import pdfplumber
# For Thai spell checking & word correction
import pythainlp
from pythainlp import spell
from pythainlp import correct
from pythainlp.util import normalize
from pythainlp.util import isthaichar
# Pandas & Numpy
import numpy as np
import pandas as pd
# Other std libs
import re

---
# **Specify Budget File URL (PDF)** 

##### **URL Dict**

In [4]:
PDF_FILE_URLS = {
    # งบกลาง สำนักนายกฯ กลาโหม
    "2022.3.1": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/8.pdf",
    # คลัง ตปท. ท่องเที่ยว พัฒนาสังคมฯ
    "2022.3.2": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/9.pdf",
    # อุดมฯ(1)
    "2022.3.3(1)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/10.pdf",
    # อุดมฯ(2)
    "2022.3.3(2)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/11.pdf",
    # อุดมฯ(3)
    "2022.3.3(3)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/12.pdf",
    # อุดมฯ(4)
    "2022.3.3(4)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/13.pdf",
    # อุดมฯ(5)
    "2022.3.3(5)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/14.pdf",
    # เกษตรฯ
    "2022.3.4": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/15.pdf",
    # คมนาคม ดิจิทัลฯ
    "2022.3.5": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/16.pdf",
    # ทรัพย์ฯ
    "2022.3.6": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/17.pdf",
    # พลังงาน พาณิชย์
    "2022.3.7": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/18.pdf",
    # มหาดไทย
    "2022.3.8": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/19.pdf",
    # ยุติธรรม แรงงาน วัฒนธรรม
    "2022.3.9": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/20.pdf",
    # ศึกษา
    "2022.3.10": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/21.pdf",
    # สาธารณสุข
    "2022.3.11": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/22.pdf",
    # อุตสาหกรรม ไม่สังกัดนายกฯ
    "2022.3.12": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/23.pdf",
    # จว. กลุ่มจว. (1)
    "2022.3.13(1)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/24.pdf",
    # จว. กลุ่มจว. (2)
    "2022.3.13(2)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/25.pdf",
    # รัฐวิสาหกิจ
    "2022.3.14": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/26.pdf",
    # สภา ศาล องค์กรอิสระ
    "2022.3.15": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/27.pdf",
    # ท้องถิ่น (1)
    "2022.3.16(1)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/28.pdf",
    # ท้องถิ่น (2)
    "2022.3.16(2)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/29.pdf",
    # ท้องถิ่น (3)
    "2022.3.16(3)": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/30.pdf",
    # หน่วยงานอื่น กาชาด ส่วนราชการในพระองค์
    "2022.3.17": "https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/31.pdf"
}

##### **File to be run**

In [5]:
FALLBACK_REF_DOC = "2022.3.2"
PDF_FILE_URL = PDF_FILE_URLS[FALLBACK_REF_DOC]

PDF_FILE_URL

'https://raw.githubusercontent.com/kaogeek/thailand-budget-pdf2csv/main/budget-pdf/9.pdf'

---
# **Declare Global Vars**

#### **Configuration Vars**

In [6]:
# A global var controlling the default mode of misplaced Thai characters fix
DEFAULT_FIX_MISPLACED_CHARS = True
# A global var controlling space remove for `fix_misplaced_chars()`
FIX_REMOVE_SPACE = True
# A global var controlling upper char back shift for `fix_misplaced_chars()`
FIX_SHIFT_UPPER_CHAR = True

# To print debugging extracted text or not?
PRINT_DEBUG_TEXT = True

# A global var controlling the default mode of word correction
DEFAULT_USE_NLP = False

#### **Misspell Corrections**

In [7]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#!!! STILL NEED TO BE EDITED !!!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

WEIRDOS = [
  ("ำา", "ำ"), ("สาหรับ", "สำหรับ"), ("จา่ ย", "จ่าย"), ("ละเอยี ด", "ละเอียด"), ("จา  แนก", "จำแนก"), ("ผลผลติ", "ผลผลิต"),
  ("บคุ ลากร", "บุคลากร"), ("ภาครฐั", "ภาครัฐ"), ("เงนิ", "เงิน"), ("ลว่ ง", "ล่วง"), ("แหลง่เงิน", "แหล่งเงิน"), ("อน่ื", "อื่น"),
  ("ต้งั้", "ตั้ง"), ("มผีล", "มีผล"), ("เน่อืง", "เนื่อง"), ("ถงึ", "ถึง"), ("ค่าจ้างประจา", "ค่าจ้างประจำ"), ("งบดา  เนินงาน", "งบดำเนินงาน"),
  ("พเิศษ", "พิเศษ"), ("เดอืน", "เดือน"), ("เตม็", "เต็ม"), ("ขนั้", "ขั้น"), ("วิสยัทศัน์", "วิสัยทัศน์"), ("พนัธกจิ", "พันธกิจ"), ("คลงั", "คลัง"),
  ("สว่น", "ส่วน"), ("จดัการ", "จัดการ"), ("ปฏิบตัิ", "ปฏิบัติ"), ("กา  กบั", "กำกับ"), ("พ้นื", "พื้น"), ("แข่งขนั", "แข่งขัน"), ("บรหิาร", "บริหาร"),
  ("ยทุธ", "ยุทธ"), ("ดิจทิลั", "ดิจิทัล"), ("ในกากบั", "ใบกำกับ"), ("ยทุธศาสตรพ์ฒันา", "ยุทธศาสตร์พัฒนา"), ("บรกิาร", "บริการ"), ("สงัคม", "สังคม"),
  ("พฒันา", "พัฒนา"), ("รกัษ์", "รักษ์"), ("ประจา  ปี", "ประจำปี"), ("ทง้ัส้นิ", "ทั้งสิ้น"), ("พสัดุ", "พัสดุ"), ("จดัเกบ็", "จัดเก็บ"), 
  ("ศลุกากร", "ศุลกากร"), ("เศรษฐกจิ", "เศรษฐกิจ"), ("ใชจ้่าย", "ใช้จ่าย"), ("สา  นัก", "สำนัก"), ("ทงั้ส้นิ", "ทั้งสิ้น"), ("ตงั้งบ", "ตั้งงบ"),
  ("ผูกพนังบ", "ผูกพันงบ"), ("ลงทนุ", "ลงทุน"), ("ยุทธศาสตรพ์ฒันา", "ยุทธศาสตร์พัฒนา"), ("เบ้ยี", "เบี้ย"), ("ใชส้อย", "ใช้สอย"), ("วสัดุ", "วัสดุ"),
  ("ปฏบิตังิาน", "ปฏิบัติงาน"), ("ทน่ีงั่", "ที่นั่ง"), ("ไมผู่กพนั", "ไม่ผูกพัน"), ("ทอ่งเท่ยีว", "ท่องเที่ยว"), ("กฬีา", "กีฬา"), ("เท่ียว", "เที่ยว"),
  ("อดุหนุน", "อุดหนุน"), ("ดาเนินงาน", "ดำเนินงาน"), ("อ่นื", "อื่น"), ("อ่ืน", "อื่น"), ("จาํแนก", "จำแนก")
]

#### **Contextual Vars**

In [8]:
# A global var keeping `REF_DOC` to be extracted from the THIRD page
REF_DOC = None
# A global var keeping current running no of item id
CUR_ITEM_ID = 0
# A global var keeping current extracting `MINISTRY`
CUR_MINISTRY = None
# A global var keeping current extracting `BUDGETARY_UNIT`
CUR_UNIT = None
# A global var keeping current extracting `BUDGET_PLAN`
CUR_PLAN = None
# A global var keeping current extracting `OUTPUT` or `PROJECT`
CUR_OUT_PROJ = None
# A global var keeping current extracting line item starting page index
CUR_ITEM_ST_PAGE_IDX = -1
# A global var keeping line items buffer of current `CUR_OUT_PROJ` or `CUR_PLAN` context.
CUR_ITEMS_BUFFER = []

# A output dict to be generated as pandas DataFrame
OUT = {
  "data": {
    "ITEM_ID": [],
    "REF_DOC": [],
    "REF_PAGE_NO": [],
    "MINISTRY": [],
    "BUDGETARY_UNIT": [],
    "CROSS_FUNC?": [],
    "BUDGET_PLAN": [],
    "OUTPUT": [],
    "PROJECT": [],
    "CATEGORY_LV1": [],
    "CATEGORY_LV2": [],
    "CATEGORY_LV3": [],
    "CATEGORY_LV4": [],
    "CATEGORY_LV5": [],
    "CATEGORY_LV6": [],
    "ITEM_DESCRIPTION": [],
    "FISCAL_YEAR": [],
    "AMOUNT": [],
    "OBLIGED?": []
  },
  "index": []
}

---
# **Declare Global Functions**

### **Declare** `find_all(substr: str, text: str)`

This method will return an array of positions of `substr` found in the given `text`.

In [9]:
def find_all(substr: str, text: str):
  if substr is None or text is None:
    return None

  return [m.start() for m in re.finditer(substr, text)]

### **Declare** `fix_misplaced_chars(input: str, useNLP=DEFAULT_USE_NLP)`

Since the most Thai characters returned from `pdfplumber`'s **`extract_text()`** function can be misplaced. We'll use this function to fix those weirdos. 

Note: By passing **`useNLP=True`**, this method will use **`pythainlp`** library for word corrections.

In [10]:
def remove_weirdo_space(input: str) -> str:
  if input is None:
    return None

  result = ""

  # Find all space bar positions occurred in text
  positions = find_all(" ", input)

  if len(positions) <= 0:
    # Return original input if no whitespace found
    return input

  lastPos = -1

  for i in positions:
    # Append current `input` chunk into `result`
    # Note: a chunk is a sub string of `input` split by space.
    result = result + input[lastPos+1:i]

    # For each space bar position, we'll remove it only if its prio- and post- character is non-digit Thai.
    # Ex: "รายละเอ ยีด" will be "รายละเอยีด", "งบบุคลากร   123 ๔ ๕ ๖" will still be the same.
    prior = "" if i <= 0 else input[i-1:i]
    post = "" if i >= len(input)-1 else input[i+1:i+2]

    if isthaichar(prior) and not contains_thai_digit(prior) and isthaichar(post) and not contains_thai_digit(post):
      # Remove this space! Do nothing
      pass
    else:
      # Leave space the same as it is. So, we need to append a space back into result
      result = result + " "

    lastPos = i

  # Append the last chunk before return
  result = result + input[lastPos+1:]
  
  return result

def back_shift_weirdo_upper_chars(input: str) -> str:
  if input is None:
    return None

  result = input

  # These upper chars cannot be placed ontop of Thai vowel chars
  # since it is syntactically incorrect.
  UPPER_CHARS = ["่", "้", "๊", "๋", "ั", "ิ", "ี", "ึ", "ื", "์"]
  VOWEL_CHARS = ["ะ", "า", "ุ", "ู", "โ"]

  for char in UPPER_CHARS:
    # Find all positions of this upper char occurred in text
    positions = find_all(char, input)

    # Shift upper char 1 position backward for all occurrences
    # if it is misplaced ontop of vowel chars.
    for i in positions:
      if i == 0:
        # This case is syntactically incorrect since the UPPER_CHARS
        # cannot be the first character of Thai words.      
        # So, remove it!

        # Please note that we've to insert whitespace here to let index
        # in `positions` still being the same, and, we'll strip() 
        # whitespace later
        result = " " + result[1:]
      else:
        # Check that it is ontop of vowel chars or not?
        baseChar = result[i-1:i]

        if baseChar in VOWEL_CHARS:
          # It is syntactically incorrect, shift it backward!
          result = result[:i-1] + char + result[i-1:i] + result[i+1:] 
        else:
          # Leave it as the same. Do nothing!
          pass

  return result.strip()

def fix_misplaced_chars(input: str, useNLP=DEFAULT_USE_NLP) -> str:
  if input is None:
    return input

  input = input.strip()

  result = input
  
  if not useNLP:

    # First, remove all weirdo whitespaces (if enabled)
    if FIX_REMOVE_SPACE:
      result = remove_weirdo_space(input)

    # Then, back shift the left misplaced upper chars :| (if enabled)
    if FIX_SHIFT_UPPER_CHAR:
      result = back_shift_weirdo_upper_chars(result)

    # Lastly, replace all predefined weirdos manually :P
    # Iterate through each weirdo tuple and replace them!!
    for w in WEIRDOS:
      result = result.replace(w[0] if not FIX_REMOVE_SPACE else remove_weirdo_space(w[0]), w[1])
  else:
    # This should be taking a very very long time to process ... :'(
    result = correct(input)

  return result

### **Declare** `contains_thai_digit(input: str)`

A function to check that the given `input` text contains thai digit or not.

In [11]:
def contains_thai_digit(input: str) -> str:
  if input is None:
    return False

  return input.find("๑") >= 0 or \
  input.find("๒") >= 0 or \
  input.find("๓") >= 0 or \
  input.find("๔") >= 0 or \
  input.find("๕") >= 0 or \
  input.find("๖") >= 0 or \
  input.find("๖") >= 0 or \
  input.find("๗") >= 0 or \
  input.find("๘") >= 0 or \
  input.find("๙") >= 0 or \
  input.find("๐") >= 0

### **Declare** `to_arabic(input: str)`

A function for Thai to Arabic number conversion.

In [12]:
def to_arabic(input: str) -> str:
  if input is None:
    return input

  return input\
  .replace("๑", "1")\
  .replace("๒", "2")\
  .replace("๓", "3")\
  .replace("๔", "4")\
  .replace("๕", "5")\
  .replace("๖", "6")\
  .replace("๖", "6")\
  .replace("๗", "7")\
  .replace("๘", "8")\
  .replace("๙", "9")\
  .replace("๐", "0")

### **Declare** `to_gregorian(input)`

A function for converting Buddhist calendar year into Gregorian calendar year. Please note that this function will return the same data type as given as an `input`.

In [13]:
def to_gregorian(input):
  if input is None:
    return input

  # If the given `input` is `int`, we'll return `int` value.
  # Otherwise, return `str`
  isInt = isinstance(input, int)

  # Try to parse anything into `int`
  input = int(input)
  # Gregorian calendar yar = Buddhist - 543
  gregorian = input - 543

  if isInt:
    return gregorian
  else:
    return str(gregorian)

### **Declare** `extract_text(page: Page|str, fixChar=DEFAULT_FIX_MISPLACED_CHARS, useNLP=DEFAULT_USE_NLP)`

A short-hand function which is equivalent to `to_arabic(fix_misplaced_chars(page.extract_text()))`.

In [14]:
def extract_text(page, fixChar=DEFAULT_FIX_MISPLACED_CHARS, useNLP=DEFAULT_USE_NLP) -> str:
  if page is None:
    return None

  # Presume that it is text
  text = page

  if not isinstance(page, str):
    # Or, it might be an instance of pdfplumber.Page

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # Many thanks to `K.Feen S` for this kindly suggestion
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    # We've to use crop technique to filter out 
    # non-displaying text box :'(
    w = page.width
    h = page.height

    text = page.within_bbox((0,0,w,h), relative=False).extract_text()
    # text = page.extract_text()

  return to_arabic(fix_misplaced_chars(text, useNLP) if fixChar else text)

### **Declare** `extract_ref_doc(page: Page|str, useNLP=DEFAULT_USE_NLP)`

A function for `REF_DOC` value extraction. The argument `page` should be the *THIRD* page of PDF document.

In [15]:
def extract_ref_doc(page, useNLP=DEFAULT_USE_NLP) -> str:
  if page is None:
    return None
    
  text = page if isinstance(page, str) else extract_text(page, False, useNLP)

  if text is None:
    return None

  start = -1
  end = -1

  # ปีงบประมาณ
  fy = "0"
  # ฉบับที่
  chabab = "0"
  # เล่มที่
  lem = "0"

  # Extract `FISCAL_YEAR` of this document
  start = text.find("พ.ศ.")

  if start >= 0:
    end = text.find("\n", start)
    # start+4 => len("พ.ศ.") is 4
    fy = to_gregorian(text[start+4:end].strip()) if end >= 0 else to_gregorian(text[start+4:].strip())

  # Extract `ฉบับที่` of this document
  start = text.find("ฉบับที่")

  if start >= 0:
    end = text.find("\n", start)
    # start+7 => len("ฉบับที่") is 7
    chabab = text[start+7:end].strip() if end >= 0 else text[start+7:].strip()

  # Extract `เล่มที่` of this document
  start = text.find("เล่มที่")

  if start >= 0:
    end = text.find("\n", start)
    # start+7 => len("เล่มที่") is 7
    lem = text[start+7:end].strip() if end >= 0 else text[start+7:].strip()

  if fy == "0" or chabab == "0" or lem == "0":
    return None

  return fy+"."+chabab+"."+lem.replace(" ", "")

### **Declare** `extract_page_no(page: Page|str, useNLP=DEFAULT_USE_NLP)`

A function for document's `page no` extraction. This is not PDF's page no, but, it's an arabic number placed at the top/bottom of the page.

In [16]:
def extract_page_no(page, useNLP=DEFAULT_USE_NLP) -> str:
  if page is None:
    return None

  text = page if isinstance(page, str) else extract_text(page, useNLP)

  if text is None:
    return None

  # Document page no should be placed at the top of the page
  # So, it should be in the first line.
  end = text.find("\n")

  pageNo = (text[:end] if end >= 0 else text).strip()

  try:
    # Check that the extracted page is an integer or not?
    int(pageNo)
  except:
    # In this case, it means that the extracted page no is not an integer.
    # So, it might mean that this `page` object contains no page no.
    return None

  # Return the extracted page if int() parsing is successful.
  return pageNo

### **Declare** `extract_line_items(pages: Page[]|str[], useNLP=DEFAULT_USE_NLP)`

A function for data `row` extraction. The argument `pages` is an array of `pdfplumber.Page` object.

In [17]:
def extract_line_items(pages, useNLP=DEFAULT_USE_NLP) -> []:
  if pages is None:
    return pages

  

  return None

---
# **Let's Run!**

### **Download** PDF file from the given url

In [18]:
http = urllib3.PoolManager()

# Declare local temp file path
temp = Path("temp.pdf")
# Download and write to temp
temp.write_bytes(http.request("GET", PDF_FILE_URL).data)



6458424

### **Read** PDF file using `pdfplumber`

In [19]:
pdf = pdfplumber.open("temp.pdf")

### **Extract** `REF_DOC` from the *THIRD* page of PDF

Note: Since the *FIRST* page is an image and the *SECOND* page is blank, the text content must be extracted from the *THIRD* page instead.

In [20]:
# We're passing useNLP=False since extracting REF_DOC does not need for NLP word correction.
REF_DOC = extract_ref_doc(pdf.pages[2], False)

if REF_DOC is not None:
  print("The extracted `REF_DOC` is: '" + REF_DOC + "'")
else:
  REF_DOC = FALLBACK_REF_DOC
  print("Using fallback `REF_DOC`: '" + REF_DOC + "'")

The extracted `REF_DOC` is: '2022.3.2'


### **Helper Functions**

##### extract_ministry(text: str)

In [21]:
def extract_ministry(text: str) -> str:
  if text is None:
    return None

  # This function will return None if current page is not a Ministry page

  # A ministry page must contain the word "เงินนอกงบประมาณ*"
  if text.find("เงินนอกงบประมาณ*") < 0:
    return None

  # Ministry (or equivalent government agency unit) name will be placed
  # at the top (the first line) of the page
  ministry = text.split("\n")[0].strip() if text.find("\n") >= 0 else None

  if ministry is None:
    return None

  if ministry.find(" ") >= 0:
    # Invalid pattern
    return None

  # A ministry level can be:
  # - กระทรวง*
  # - สำนัก*
  # - ส่วนราชการ*
  # - หน่วยงาน*
  # - จังหวัดและกลุ่มจังหวัด
  # - รัฐวิสาหกิจ
  # - องค์กรปกครองส่วนท้องถิ่น
  # - งบกลาง
  
  STARTS_WITH = ["กระทรวง", "สำนัก", "ส่วนราชการ", "หน่วยงาน"]
  EQUALS = ["จังหวัดและกลุ่มจังหวัด", "รัฐวิสาหกิจ", "องค์กรปกครองส่วนท้องถิ่น", "งบกลาง", "ทุนหมุนเวียน"]

  # Check starts with
  for s in STARTS_WITH:
    if ministry.startswith(s): 
      return ministry

  # Check equals
  for e in EQUALS:
    if ministry == e:
      return ministry

  # Not match any criteria
  return None  

##### extract_budgetary_unit(ministry: str, text: str) -> str

In [22]:
def extract_budgetary_unit(ministry: str, text: str) -> str:
  if text is None or ministry is None:
    return None

  # This function will return None if current page is not a budgetary unit page

  # A budgetary unit page must contain the word "เงินนอกงบประมาณ*"
  if text.find("เงินนอกงบประมาณ*") < 0:
    return None

  # Budgetary unit (or equivalent government agency unit) name will be
  # at the top of the page
  lines = text.split("\n")

  # The first line must be ministry name (or equivalent)
  mCheck = "" if lines[0] is None else lines[0].strip()
  # The second line will be budgetary unit
  unit = "" if lines[1] is None else lines[1].strip()

  # Check that the ministry name does match current context
  if mCheck != ministry:
    return None
    
  # Everything looks good!
  return unit  

##### extract_budget_plan(ministry: str, unit: str, text: str) -> str

In [23]:
def extract_budget_plan(ministry: str, unit: str, text: str) -> str:
  if text is None or ministry is None or unit is None:
    return None

  # This function will return None if current page is not a budget plan page

  # A budget plan page must contain "x.y แผนงาน*" where `x` and `y` is a number
  if not re.search("[0-9]+(\\.[0-9]+)? +แผนงาน.*", text):
    return None

  lines = text.split("\n")

  # Budget plan should be in the top 5 lines of the page
  for i in range(0, 5):
    plan = "" if lines[i] is None else lines[i].strip()

    # If this line stars with "x.y แผนงาน*", extract budget plan 
    # name from this line.
    if re.search("^[0-9]+(\\.[0-9]+)? +แผนงาน.* .*", plan):
      # Strip "x.y" off
      plan = plan[plan.find(" ")+1:].strip()
      # Strip "<amount> บาท" off
      return plan[0:plan.find(" ")]
  
  # No budget plan found in top 5 lines of the page
  return None  

##### extract_output_project(ministry: str, unit: str, plan: str, text: str) -> str

In [24]:
def extract_output_project(ministry: str, unit: str, plan: str, text: str) -> str:
  if text is None or ministry is None or unit is None or plan is None:
    return None

  # This function will return None if current page is not a output/project page

  # A budget plan page must contain "x.y.z (ผลผลิตที่|โครงการที่)*" where `x`, `y` and `z` is a number
  if not re.search("[0-9]+(\\.[0-9]+)? +(ผลผลิต|โครงการ).*", text):
    return None

  lines = text.split("\n")

  # Output/project should be in the top 5 lines of the page
  for i in range(0, 5):
    outProj = "" if lines[i] is None else lines[i].strip()

    # If this line stars with "x.y.z (ผลผลิตที่|โครงการที่)*", extract output/project 
    # name from this line.
    if re.search("[0-9]+(\\.[0-9]+)? +(ผลผลิต|โครงการ).*", outProj):
      # Strip "ผลผลิตที่:|โครงการที่:" off
      outProj = outProj[outProj.find(":")+1:].strip()

      if outProj.find(" บาท") < 0:
        # This is multi-line output/project name
        # We've to look for its full name a little bit further.
        for j in range(i + 1, i + 6):
          append = "" if lines[j] is None else lines[j].strip()
          
          # Append output/project name
          outProj = outProj + append

          if append.endswith(" บาท"):
            # End of name found!
            break
            
      # Strip the word " บาท" off!
      outProj = outProj[0:outProj.rindex(" ")]

      # Strip all "amount" number off!
      stripTo = len(outProj)
      while stripTo > 0:
        if isthaichar(outProj[stripTo-1: stripTo]):
          # Thai character found!
          break

        stripTo = stripTo - 1

      return outProj[0:stripTo]
  
  # No output/project found in top 5 lines of the page
  return None  

##### is_line_item_start_page(ministry: str, unit: str, plan: str, outProj: str, text: str) -> bool

In [25]:
def is_line_item_start_page(ministry: str, unit: str, plan: str, outputProject: str, text: str) -> bool:
  # This method is not for "งบกลาง"
  # So, the budget plan must be specified
  # In some cases, like "แผนงานบุคลากรภาครัฐ", may not contain output/project,
  # hence, we're not going to check output/project here.
  if text is None or ministry is None or unit is None or plan is None:
    return False

  # A line item starting page must contain keyword "รายละเอียดงบประมาณจำแนกตามงบรายจ่าย"
  if text.find("รายละเอียดงบประมาณจำแนกตามงบรายจ่าย") < 0:
    return False

  if plan == "แผนงานบุคลากรภาครัฐ":
    # If it is "แผนงานบุคลากรภาครัฐ" plan, we'll return true immediately since
    # there is no output/project in this plan.
    return True

  if outputProject is None:
    # Otherwise, the output/project must be specified.
    return False

  return True

  '''
  # We've to look into multiple lines since an output/project name might 
  # be written in a couple of lines.
  lines = text.split("\n")
  
  # Output/project should be in the top 5 lines of the page
  for i in range(0, 5):
    if i >= len(lines): break

    outProj = "" if lines[i] is None else lines[i].strip()

    # If this line stars with "(ผลผลิต|โครงการ) : .*", extract output/project 
    # name from this line.
    if re.search("(ผลผลิต|โครงการ) *\\: *.*", outProj):
      return True

  return False
  '''

##### is_line_item_end_page(ministry: str, unit: str, plan: str, outProj: str, text: str) -> bool

In [26]:
def is_line_item_end_page(ministry: str, unit: str, plan: str, outputProject: str, text: str) -> bool:
  if text is None:
    return True
  
  # Line item is in section "7" of each budgetary unit.
  # So, if we see section "8.รายงานสถานะและแผนการใช้จ่ายเงินนอกงบประมาณ" at the first
  # 5 lines of the page, it means that this is the end of line item.
  lines = text.split("\n")

  for i in range(0, 5):
    if i >= len(lines): break

    if re.search("^8\\. *รายงานสถานะและแผนการ.*", lines[i]):
      return True

  return False

##### extract_line_items(pageNo: str, text: str) -> []
Since some of line items are written in multiple lines, this function will collapse multi-line items into single line item tuples.

**Returns** an array of single-line item tuples being sent to be processed via **process_line_items()** function.

**Syntax:** Each line item must end with " บาท" keyword.

In [27]:
def extract_line_items(pageNo: str, text: str) -> []:
  if text is None:
    return []

  result = []

  lines = text.split("\n")

  # Allowed regex
  regexs = [
    "^[0-9]+\\.(([0-9]+\\.)*[0-9]+)?.*",
    "^\\(?[0-9]+(\\.[0-9]+)*\\).*",
    "^วงเงินทั้งสิ้น.*",
    "^ปี [0-9]{2,4}.*"
  ]

  stackChars = "(0123456789.)"
  amountChars = "0123456789,. "

  buffer = None
  for line in lines:
    line = line.strip()

    if line == "" or line is None:
      continue

    if buffer is None:
      # No line item starting point found yet.
      # Check that the line does match line item 
      # starting point regexs or not?
      match = False

      for regex in regexs:
        if re.search(regex, line):
          match = True
          break
      
      if match:
        buffer = line
    else:
      # Buffer != none means the starting point of 
      # line item is found. Append this line to buffer.
      buffer = buffer + line

    if buffer is not None:
      # See that this is the ending line or not?
      if buffer.find(" บาท") >= 0:
        # Ending line found

        # Strip " บาท" off!
        buffer = buffer[0:buffer.rindex(" บาท")]

        # Split item "stack" vs "description" vs "amount"
        descIdx = 0
        amountIdx = len(buffer)

        last_char = None

        # Find description start index
        while descIdx < len(buffer):
          curChar = buffer[descIdx: descIdx+1]

          if stackChars.find(curChar) < 0:
            # Non-stack character found!
            break

          last_char = curChar

          descIdx = descIdx + 1

        last_char = None

        # Find amount start index
        while amountIdx > 0:
          curChar = buffer[amountIdx-1: amountIdx]

          if amountChars.find(curChar) < 0:
            # Non-amount character found!
            break

          if last_char == " " and curChar == " ":
            # Double space found!
            break

          last_char = curChar

          amountIdx = amountIdx - 1

        item_stack = buffer[0:descIdx].strip()
        item_desc = buffer[descIdx:amountIdx].strip()
        item_amnt = buffer[amountIdx:].strip().replace(" ", "").replace(",", "")

        try:
          item_amnt = float(item_amnt)
        except:
          print("Cannot parse item amount to float: " + item_desc + " = " + item_amnt) 

        # Append line item tuple into result
        result.append((item_stack, item_desc, item_amnt, pageNo))

        # Reset buffer to find next item
        buffer = None

  return result

##### get_parent_stack_no(stackNo: str) -> str

In [28]:
def get_parent_stack_no(stackNo: str) -> str:
  if stackNo is None:
    return None
  
  has_par_l = stackNo.startswith("(")
  has_par_r = stackNo.endswith(")")

  stackNo = stackNo.replace("(", "").replace(")", "")

  if stackNo.endswith("."):
    stackNo = stackNo[0:len(stackNo)-1]

  if stackNo.find(".") < 0:
    # This is root level
    return None
  
  parent = stackNo[0:stackNo.rindex(".")]

  if has_par_l or has_par_r:
    return ("(" if has_par_l else "") + parent + ")" if has_par_r else ""   
  
  return parent if parent.find(".") >= 0 else parent + "."

##### process_line_items(ministry: str, unit: str, plan: str, outProj: str, curItemId: int, lines: [], out: {}) -> int

**Returns** New value of `CUR_ITEM_ID`

In [29]:
def process_line_items(ministry: str, unit: str, plan: str, outProj: str, curItemId: int, lines: [], out: {}):
  if lines is None or len(lines) <= 0:
    return curItemId

  lastLine = None
  # [("1.", "lvl 1 description"), ("1.1", "lvl 2 description"), ...]
  curStack = []
  obliged_st_line = None
  last_out_line = None

  for l in range(0, len(lines)):
    line = lines[l]
    nextLine = lines[l + 1] if l < len(lines) - 1 else None

    item_stack = line[0]
    item_stack_no = item_stack.replace("(", "").replace(")", "")
    item_desc = line[1]
    item_amnt = line[2]
    item_page = line[3]
    item_fy = [""]

    # The following cases will be ignored:
    if item_desc.startswith("วงเงินทั้งสิ้น") or \
      item_desc.startswith("เงินนอกงบประมาณ"):
      continue

    if nextLine is not None:
      if nextLine[1].startswith("วงเงินทั้งสิ้น"):
        obliged_st_line = line
        continue

    # Check that current line has child(ren)/descendant(s) or not?
    # If yes, put this line "description" into stack and do not
    # write this line to "out". Otherwise, write this line to 
    # "out" with current context & stack.
    hasChild = False

    valid = True

    if item_stack != "":
      # This is data line item
      # Reset obliged start line
      obliged_st_line = None

      if item_stack.find("(") < 0 and item_stack.find(")") < 0:
        # Current line is main category "x.y.z"
        if nextLine is None:
          # There is no further line
          # So, current line must be leaf, send it to OUT
          hasChild = False
        else:
          if nextLine[0].find("(") < 0 and nextLine[0].find(")") < 0:
            # Next line is also main category "a.b.c"
            next_stack_no = nextLine[0]

            # Check that the next line is a child of this line or not?
            if next_stack_no == item_stack_no:
              if item_stack_no == "1.":
                # This is the case:
                # 1. PARENT
                # 1. CHILD
                hasChild = True
              else:
                # This should not happen!
                valid = False
            elif next_stack_no.startswith(item_stack_no):
              # This is the case:
              # 1. PARENT
              # 1.1 CHILD
              hasChild = True
            else:
              # This is the case:
              # 1. A
              # 2. B: A slibing of A
              # -- or --
              # 1.1 A
              # 1.2 B: A sibling of  A
              # -- or --
              # 1.1 A
              # 2. B: A sibling of parent of A
              # -- or --
              # 1.1.1 A
              # 1.2. B: A sibling of parent of A
              hasChild = False
          elif nextLine[0] != "":
            # Next line is sub item "(a.b.c)" !!!
            # So, we're sure that the current line must have child(ren)
            
            # This is the case:
            # 1. A
            # (x) B: A child of A
            hasChild = True
      else:
        # Current line is sub item "(x.y.z)"
        if nextLine is None:
          # There is no further line
          # So, current line must be leaf, send it to OUT
          hasChild = False
        else:
          if nextLine[0].find("(") < 0 and nextLine[0].find(")") < 0:
            # Next line is main category "a.b.c" !!!
            # So, we're sure that the current line must be leaf

            # This is the case:
            # (1) A
            # X. B: A sibling of parent of A
            hasChild = False
          elif nextLine[0] != "":
            # Next line is sub item "(a.b.c)"

            next_stack_no = nextLine[0].replace("(", "").replace(")", "")

            # Check that the next line is a child of this line or not?
            if next_stack_no == item_stack_no:
              if item_stack_no == "1":
                # This is the case:
                # (1) PARENT
                # (1) CHILD
                hasChild = True
              else:
                # This should not happen!
                valid = False
            elif next_stack_no.startswith(item_stack_no):
              # This is the case:
              # (1) PARENT
              # (1.1) CHILD
              hasChild = True
            else:
              # This is the case:
              # (1) A
              # (2) B: A slibing of A
              # -- or --
              # (1.1) A
              # (1.2) B: A sibling of  A
              # -- or --
              # (1.1) A
              # (2) B: A sibling of parent of A
              # -- or --
              # (1.1.1) A
              # (1.2) B: A sibling of parent of A
              hasChild = False
    elif item_desc.startswith("ปี "):
      # This is obliged line item
      # Strip "ปี " off!
      fy = item_desc[3:].strip()

      # Current line is fiscal year row
      fyChar = "0123456789- "

      enIdx = len(fy)
      for i in range(0, len(fy)):
        if fyChar.find(fy[i]) < 0:
          enIdx = i
          break
      
      fy = fy[0:enIdx].strip().replace(" ","")

      if fy.find("-") >= 0:
        fySplit = fy.split("-")

        y_start = -1
        y_end = -1

        try:
          y_start = int(fySplit[0])
          y_end = int(fySplit[1])
        except:
          pass
        
        if y_start < 0 or y_end < 0:
          # Cannot parse fy start/end
          print("!!!!!!!!!!!!!!!")
          print("!!!! ERROR !!!!")
          print("!!!!!!!!!!!!!!!")
          print("Cannot parse fiscal year start/end for line: "+item_desc)
          print("This FISCAL_YEAR will be used: "+fy)
            
          item_fy = [fy]
        else:
          item_fy = []
          for i in range(y_start, y_end + 1):
            item_fy.append(to_gregorian(str(i)))
      else:
        item_fy = [to_gregorian(fy)]

      # Mark this line as a valid data line to be sent to OUT
      valid = True
    
    if not valid:
      # Unsupported case found!
      # Ignore this line & print message for debug
      print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
      print("!!!! UNSUPPORTED LINE FORMAT !!!!")
      print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
      print(line)
      print("will be ignored.")
      continue

    if hasChild:
      # Put this line into stack and process the next line
      curStack.append((line[0], line[1]))
    else:
      for fy in item_fy:
        # Write this line to "out" with current contenxt & stack
        item_id = REF_DOC + "." + str(curItemId)

        curItemId = curItemId + 1

        # Append index row
        out["index"].append(item_id)
        # Append data row
        out["data"]["ITEM_ID"].append(item_id)
        out["data"]["REF_DOC"].append(REF_DOC)
        out["data"]["REF_PAGE_NO"].append(item_page)
        out["data"]["MINISTRY"].append(ministry)
        out["data"]["BUDGETARY_UNIT"].append(unit)
        out["data"]["CROSS_FUNC?"].append(plan is not None and plan.startswith("แผนงานบูรณาการ"))
        out["data"]["BUDGET_PLAN"].append(plan)
        out["data"]["OUTPUT"].append(outProj if outProj is not None and not outProj.startswith("โครงการ") else "")
        out["data"]["PROJECT"].append(outProj if outProj is not None and outProj.startswith("โครงการ") else "")

        # Write current sub category stack into OUT
        for s in range(0, len(curStack)):
          if s >= 6:
            print("!!!!!!!!!!!!!!!!!!!!!!!!")
            print("!!!! STACK OVERFLOW !!!!")
            print("!!!!!!!!!!!!!!!!!!!!!!!!")
            print("Category Level "+str(s+1)+": "+curStack[s][1])
            print("will be ignored.")
            continue
          
          out["data"]["CATEGORY_LV"+str(s+1)].append(curStack[s][1])
        # Fill in CATEGORY_LV*
        if len(curStack) < 6:
          for s in range(len(curStack)+1, 7):
            out["data"]["CATEGORY_LV"+str(s)].append("")
          
        out["data"]["ITEM_DESCRIPTION"].append(item_desc if obliged_st_line is None else obliged_st_line[1])
        out["data"]["FISCAL_YEAR"].append(fy if fy != "" else REF_DOC[:REF_DOC.find(".")])
        out["data"]["AMOUNT"].append(item_amnt)
        out["data"]["OBLIGED?"].append(fy != "")
      
      # Pop current stack out to be matched the next line's parent
      if nextLine is None:
        curStack = []
      else:
        if nextLine[0] is not None and nextLine[0] != "":
          nextLineParent = get_parent_stack_no(nextLine[0])
          
          if nextLineParent is None or nextLineParent == "":
            if nextLine[0].find("(") >= 0 or nextLine[0].find(")") >= 0:
              # Next line stack no must be in "(x)" format
              nextLineStackNo = int(nextLine[0].replace("(", "").replace(")", ""))
              nextLineParentStack = ("(" if nextLine[0].find("(") >= 0 else "") + str(nextLineStackNo - 1) + (")" if nextLine[0].find(")") >= 0 else "")
            
              lastOutStack = None if last_out_line is None else last_out_line[0]

              if nextLineParentStack == lastOutStack or nextLineParentStack == item_stack:
                # The next line should be sibling of current line
                # since its stack no is incremental.
                # So, we're going to leave current stack being the same

                # Example of this case:
                # (1) A
                # (2) B: A sibling of A
                pass
              else:
                # Pop current stack off until reaching the
                # same level of the next line's parent !!!

                # Example of this case:
                # (1) A
                # (1) B: A child of A <<< This stack will be popped out from while loop
                # (1) C: A child of B
                # (2) D: A sibling of A
                while len(curStack) > 0 and curStack[-1][0] != nextLineParentStack:
                  curStack.pop()

                # Pop another level off since next line is a sibling
                # of current line

                # Example of this case:
                # (1) A <<< Pop this out also since D is a sibling of A
                # (1) B: A child of A 
                # (1) C: A child of B
                # (2) D: A sibling of A
                if len(curStack) > 0: curStack.pop()
            else:
              # Next line is root level
              # Clear all stacks off!
              curStack = []
          else:
            # Pop current stack off until reaching the
            # same level of the next line's parent !!!

            # Example of this case:
            # 1. A 
            # 1.1 B: A child of A <<< Pop this out
            # 1.1.1 C: A child of B
            # 1.2 D: A child of A
            while len(curStack) > 0 and curStack[-1][0] != nextLineParent:
              curStack.pop()
        else:
          # The next line might be fiscal year row
          # So, we'll leave current stack being the same
          pass
      
      last_out_line = obliged_st_line if obliged_st_line is not None else line

  return curItemId

### **Extract** data from entire document

**State Machine** (for each `page`)
* **if** `CUR_MINISTRY` is `None` => find_ministry()
* **if** end of ministry is found => reset `CUR_MINISTRY` `CUR_UNIT` `CUR_PLAN` and `CUR_OUT_PROJ`
* **if** `CUR_UNIT` is `None` => find_budgetary_unit()
* **if** end of budgetary unit is found => reset `CUR_UNIT` `CUR_PLAN` and `CUR_OUT_PROJ`
* **if** `CUR_PLAN` is `None` => find_budget_plan()
* **if** end of budget plan is found => reset `CUR_PLAN` and `CUR_OUT_PROJ`
* **if** `CUR_OUT_PROJ` is `None` => find_output_project()
* **if** end of output or project is found => reset `CUR_OUT_PROJ`
* **extract** each line item with `CUR_*` context

In [30]:
# Reset context vars
i = 0
CUR_ITEM_ID = 0
CUR_MINISTRY = None
CUR_UNIT = None
CUR_PLAN = None
CUR_OUT_PROJ = None
CUR_ITEM_ST_PAGE_IDX = -1
CUR_ITEMS_BUFFER = []

OUT = {
  "data": {
    "ITEM_ID": [],
    "REF_DOC": [],
    "REF_PAGE_NO": [],
    "MINISTRY": [],
    "BUDGETARY_UNIT": [],
    "CROSS_FUNC?": [],
    "BUDGET_PLAN": [],
    "OUTPUT": [],
    "PROJECT": [],
    "CATEGORY_LV1": [],
    "CATEGORY_LV2": [],
    "CATEGORY_LV3": [],
    "CATEGORY_LV4": [],
    "CATEGORY_LV5": [],
    "CATEGORY_LV6": [],
    "ITEM_DESCRIPTION": [],
    "FISCAL_YEAR": [],
    "AMOUNT": [],
    "OBLIGED?": []
  },
  "index": []
}

last_page_no = None

for page in pdf.pages:
  # Extract text from page
  text = extract_text(page)
  pageNo = (lambda x : "" if x is None else x)(extract_page_no(text))

  if pageNo is None and last_page_no is not None:
    # Try to guess it!
    pageNo = str(int(last_page_no)+1)

  last_page_no = pageNo

  print("==========")
  print("PAGE: "+str(i)+", no: "+pageNo)
  print("----------")
  print("REF_DOC: "+("" if REF_DOC is None else REF_DOC)+\
        ", CUR_ITEM_ID: "+str(CUR_ITEM_ID)+\
        ", CUR_MINISTRY: "+("" if CUR_MINISTRY is None else CUR_MINISTRY)+\
        ", CUR_UNIT: "+("" if CUR_UNIT is None else CUR_UNIT)+\
        ", CUR_PLAN: "+("" if CUR_PLAN is None else CUR_PLAN)+\
        ", CUR_OUT_PROJ: "+("" if CUR_OUT_PROJ is None else CUR_OUT_PROJ))
  print("----------")

  # Try to extract ministry from page
  newMinistry = extract_ministry(text)

  if newMinistry is not None and newMinistry != CUR_MINISTRY:
    # A new ministry is found!

    # Flush current line items buffer into OUT
    CUR_ITEM_ID = process_line_items(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, CUR_ITEM_ID, CUR_ITEMS_BUFFER, OUT)

    # Set new ministry into contextual var
    CUR_MINISTRY = newMinistry

    # Reset sub ministry context:
    CUR_UNIT = None
    CUR_PLAN = None
    CUR_OUT_PROJ = None
    CUR_ITEM_ST_PAGE_IDX = -1
    CUR_ITEMS_BUFFER = []

    print("> FOUND NEW MINISTRY: " + CUR_MINISTRY)
  elif CUR_MINISTRY is not None:
    # This is a page within ministry

    # Try to extract budgetary unit from page
    newUnit = extract_budgetary_unit(CUR_MINISTRY, text)

    if newUnit is not None and newUnit != CUR_UNIT:
      # A new budgetary unit is found!

      # Flush current line items buffer into OUT
      CUR_ITEM_ID = process_line_items(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, CUR_ITEM_ID, CUR_ITEMS_BUFFER, OUT)

      # Set a new budgetary unit into contextual var
      CUR_UNIT = newUnit

      # Reset sub budgetary unit context:
      CUR_PLAN = None
      CUR_OUT_PROJ = None
      CUR_ITEM_ST_PAGE_IDX = -1
      CUR_ITEMS_BUFFER = []

      print("> FOUND NEW BUDGETARY UNIT: " + CUR_UNIT)
    elif CUR_UNIT is not None:
      # This is a page within budgetary unit

      # Try to extract budget plan from page
      newPlan = extract_budget_plan(CUR_MINISTRY, CUR_UNIT, text)

      if newPlan is not None and newPlan != CUR_PLAN:
        # A new budget plan is found!

        # Flush current line items buffer into OUT
        CUR_ITEM_ID = process_line_items(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, CUR_ITEM_ID, CUR_ITEMS_BUFFER, OUT)

        # Set a new budget plan into contextual var
        CUR_PLAN = newPlan

        # Reset sub budget plan context:
        CUR_OUT_PROJ = None
        CUR_ITEM_ST_PAGE_IDX = -1
        CUR_ITEMS_BUFFER = []

        print("> FOUND NEW BUDGET PLAN: " + CUR_PLAN)
      
      # The first output/project of a budget plan may be in
      # the same page of its parent plan.

      # So, we're going to extract output/project from this 
      # page imediately after a new plan is found.
      if CUR_PLAN is not None:
        # This is a page within budget plan

        # Try to extract output/project from page
        newOutProj = extract_output_project(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, text)

        if newOutProj is not None and newOutProj != CUR_OUT_PROJ:
          # A new output/project is found!

          # Flush current line items buffer into OUT
          CUR_ITEM_ID = process_line_items(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, CUR_ITEM_ID, CUR_ITEMS_BUFFER, OUT)

          # Set a new output/project into contextual var
          CUR_OUT_PROJ = newOutProj

          # Reset line item start page index
          CUR_ITEM_ST_PAGE_IDX = -1
          CUR_ITEMS_BUFFER = []

          print("> FOUND NEW OUTPUT/PROJECT: " + CUR_OUT_PROJ)
        elif CUR_OUT_PROJ is not None or CUR_PLAN == "แผนงานบุคลากรภาครัฐ":
          # Note: "แผนงานบุคลากรภาครัฐ" may not contain output/project.
          # So, the CUR_OUT_PROJ can be none for this plan.

          isStartPage = is_line_item_start_page(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, text)

          if CUR_ITEM_ST_PAGE_IDX < 0 and isStartPage:
            CUR_ITEM_ST_PAGE_IDX = i

            print("> FOUND LINE ITEM START PAGE IDX: " + str(CUR_ITEM_ST_PAGE_IDX))
          
          if CUR_ITEM_ST_PAGE_IDX >= 0:
            if is_line_item_end_page(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, text):
              # End of line item page found!
              print("> FOUND LINE ITEM END PAGE IDX: " + str(i))

              # Flush current line items buffer into OUT
              CUR_ITEM_ID = process_line_items(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, CUR_ITEM_ID, CUR_ITEMS_BUFFER, OUT)

              # Reset context vars
              if CUR_PLAN == "แผนงานบุคลากรภาครัฐ":
                CUR_PLAN = None

              CUR_OUT_PROJ = None
              CUR_ITEM_ST_PAGE_IDX = -1
              CUR_ITEMS_BUFFER = []
            else:
              # This page is still in line item page range
              # Do line item extraction and extend it into buffer
              print("> Extracting line items ...")
              print()
              print(text)

              CUR_ITEMS_BUFFER.extend(extract_line_items(pageNo, text))
          else:
            # No line item start page found yet :|
            # Print `text` for debugging purpose.
            print("> DEBUG: Looking for line item starting page ...")
            if PRINT_DEBUG_TEXT:
              print()
              print(text)
        else:
          # No output/project found yet :|
          # We've to find a new output/project first

          # Print `text` for debugging purpose.
          print("> DEBUG: Looking for output/project ...")
          if PRINT_DEBUG_TEXT:
            print()
            print(text)
      else:
        # No budget plan found yet :|
        # Print `text` for debugging purpose.
        print("> DEBUG: Looking for budget plan ...")
        if PRINT_DEBUG_TEXT:
          print()
          print(text)
    elif CUR_MINISTRY == "งบกลาง":
      # Note: "งบกลาง" is a ministry level unit but does not 
      # contain budgetary unit and output/project. 
      # So, we'll handle it separately.

      # !!!!!!!!!!!!!!!!!!!!!!
      # !!! IMPLEMENT THIS !!!
      # !!!!!!!!!!!!!!!!!!!!!!
      pass
    else:
      # No bugetary unit found yet :|
      # We've to find a new budgetary unit first

      # Print `text` for debugging purpose.
      print("> DEBUG: Looking for budgetary unit ...")
      if PRINT_DEBUG_TEXT:
        print()
        print(text)
  else:
    # No ministry found yet :|
    # We've to find a new ministry first

    # Print `text` for debugging purpose.
    print("> DEBUG: Looking for ministry ...")
    if PRINT_DEBUG_TEXT:
      print()
      print(text)

  i += 1

# Flush last line items left in buffer into OUT
CUR_ITEM_ID = process_line_items(CUR_MINISTRY, CUR_UNIT, CUR_PLAN, CUR_OUT_PROJ, CUR_ITEM_ID, CUR_ITEMS_BUFFER, OUT)

[1;30;43mเอาต์พุตของการสตรีมมีการตัดเหลือเพียง 5000 บรรทัดสุดท้าย[0m
REF_DOC: 2022.3.2, CUR_ITEM_ID: 1526, CUR_MINISTRY: กระทรวงการท่องเที่ยวและกีฬา, CUR_UNIT: องคก์ารบริหารการพัฒนาพ้ืนท่ีพิเศษเพ่ือการท่องเที่ยวอย่างยงั่ยืน (องคก์ารมหาชน), CUR_PLAN: แผนงานบุคลากรภาครัฐ, CUR_OUT_PROJ: 
----------
> FOUND NEW BUDGET PLAN: แผนงานพื้นฐานด้านการสร้างความสามารถในการแข่งขัน
> FOUND NEW OUTPUT/PROJECT: พื้นท่ที่มีีศกัยภาพด้านการท่องเท่ยีวไดร้บัการพัฒนาเพ่อืยกระดบั
PAGE: 340, no: 
----------
REF_DOC: 2022.3.2, CUR_ITEM_ID: 1526, CUR_MINISTRY: กระทรวงการท่องเที่ยวและกีฬา, CUR_UNIT: องคก์ารบริหารการพัฒนาพ้ืนท่ีพิเศษเพ่ือการท่องเที่ยวอย่างยงั่ยืน (องคก์ารมหาชน), CUR_PLAN: แผนงานพื้นฐานด้านการสร้างความสามารถในการแข่งขัน, CUR_OUT_PROJ: พื้นท่ที่มีีศกัยภาพด้านการท่องเท่ยีวไดร้บัการพัฒนาเพ่อืยกระดบั
----------
> FOUND LINE ITEM START PAGE IDX: 340
> Extracting line items ...

รายละเอียดงบประมาณจำแนกตามงบรายจ่าย
รายละเอียดงบประมาณจำแนกตามงบรายจ่าย
ผลผลิต : พื้นท่ีท่ีมีศกัยภาพด้านการท่องเที่ยวไดร้บักา

### **Print/Write Result**

In [31]:
print(OUT)

{'data': {'ITEM_ID': ['2022.3.2.0', '2022.3.2.1', '2022.3.2.2', '2022.3.2.3', '2022.3.2.4', '2022.3.2.5', '2022.3.2.6', '2022.3.2.7', '2022.3.2.8', '2022.3.2.9', '2022.3.2.10', '2022.3.2.11', '2022.3.2.12', '2022.3.2.13', '2022.3.2.14', '2022.3.2.15', '2022.3.2.16', '2022.3.2.17', '2022.3.2.18', '2022.3.2.19', '2022.3.2.20', '2022.3.2.21', '2022.3.2.22', '2022.3.2.23', '2022.3.2.24', '2022.3.2.25', '2022.3.2.26', '2022.3.2.27', '2022.3.2.28', '2022.3.2.29', '2022.3.2.30', '2022.3.2.31', '2022.3.2.32', '2022.3.2.33', '2022.3.2.34', '2022.3.2.35', '2022.3.2.36', '2022.3.2.37', '2022.3.2.38', '2022.3.2.39', '2022.3.2.40', '2022.3.2.41', '2022.3.2.42', '2022.3.2.43', '2022.3.2.44', '2022.3.2.45', '2022.3.2.46', '2022.3.2.47', '2022.3.2.48', '2022.3.2.49', '2022.3.2.50', '2022.3.2.51', '2022.3.2.52', '2022.3.2.53', '2022.3.2.54', '2022.3.2.55', '2022.3.2.56', '2022.3.2.57', '2022.3.2.58', '2022.3.2.59', '2022.3.2.60', '2022.3.2.61', '2022.3.2.62', '2022.3.2.63', '2022.3.2.64', '2022.3.2.65'

In [32]:
df = pd.DataFrame(OUT["data"], index=OUT["index"])

df.to_csv(REF_DOC + ".csv")

df

Unnamed: 0,ITEM_ID,REF_DOC,REF_PAGE_NO,MINISTRY,BUDGETARY_UNIT,CROSS_FUNC?,BUDGET_PLAN,OUTPUT,PROJECT,CATEGORY_LV1,CATEGORY_LV2,CATEGORY_LV3,CATEGORY_LV4,CATEGORY_LV5,CATEGORY_LV6,ITEM_DESCRIPTION,FISCAL_YEAR,AMOUNT,OBLIGED?
2022.3.2.0,2022.3.2.0,2022.3.2,13,กระทรวงการคลัง,สำนักงานปลดักระทรวงการคลัง,False,แผนงานบุคลากรภาครัฐ,,,งบบุคลากร,เงินเดือนและค่าจ้างประจำ,,,,,เงินเดือน,2022,9.68962e+07,False
2022.3.2.1,2022.3.2.1,2022.3.2,13,กระทรวงการคลัง,สำนักงานปลดักระทรวงการคลัง,False,แผนงานบุคลากรภาครัฐ,,,งบบุคลากร,เงินเดือนและค่าจ้างประจำ,,,,,ค่าจ้างประจำ,2022,5.4791e+06,False
2022.3.2.2,2022.3.2.2,2022.3.2,13,กระทรวงการคลัง,สำนักงานปลดักระทรวงการคลัง,False,แผนงานบุคลากรภาครัฐ,,,งบบุคลากร,,,,,,ค่าตอบแทนพนกังานราชการ,2022,1.78327e+07,False
2022.3.2.3,2022.3.2.3,2022.3.2,13,กระทรวงการคลัง,สำนักงานปลดักระทรวงการคลัง,False,แผนงานบุคลากรภาครัฐ,,,งบดำเนินงาน,ค่าตอบแทนใช้สอยและวัสดุ,,,,,ค่าเช่าบ้าน,2022,198000,False
2022.3.2.4,2022.3.2.4,2022.3.2,13,กระทรวงการคลัง,สำนักงานปลดักระทรวงการคลัง,False,แผนงานบุคลากรภาครัฐ,,,งบดำเนินงาน,ค่าตอบแทนใช้สอยและวัสดุ,,,,,ค่าตอบแทนพิเศษข้าราชการและลูกจ้างทไ่ีดร้บัเงิน...,2022,298200,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022.3.2.2283,2022.3.2.2283,2022.3.2,527,กระทรวงการพัฒนาสังคมและความมนั่คงของมนุษย์,สถาบนัพัฒนาองคก์รชมุชน (องคก์ารมหาชน),False,แผนงานยุทธศาสตรส์ร้างหลกัประกนัทางสังคม,,โครงการผูม้ีรายไดน้อ้ยในเมืองและชนบทมีความมนั่...,งบเงินอุดหนุน,เงินอุดหนุนทวั่ไป,,,,,สนบัสนุนการพัฒนาทอ่ียูอ่าศยัชุมชนรมิคลองเปรมปร...,2022,8.52e+07,False
2022.3.2.2284,2022.3.2.2284,2022.3.2,527,กระทรวงการพัฒนาสังคมและความมนั่คงของมนุษย์,สถาบนัพัฒนาองคก์รชมุชน (องคก์ารมหาชน),False,แผนงานยุทธศาสตรส์ร้างหลกัประกนัทางสังคม,,โครงการผูม้ีรายไดน้อ้ยในเมืองและชนบทมีความมนั่...,งบเงินอุดหนุน,เงินอุดหนุนทวั่ไป,,,,,สนบัสนุนการแกไ้ขปญัหาทอ่ียูอ่าศยับ้านมนั่คง,2022,3.3675e+08,False
2022.3.2.2285,2022.3.2.2285,2022.3.2,527,กระทรวงการพัฒนาสังคมและความมนั่คงของมนุษย์,สถาบนัพัฒนาองคก์รชมุชน (องคก์ารมหาชน),False,แผนงานยุทธศาสตรส์ร้างหลกัประกนัทางสังคม,,โครงการผูม้ีรายไดน้อ้ยในเมืองและชนบทมีความมนั่...,งบเงินอุดหนุน,เงินอุดหนุนทวั่ไป,,,,,สนบัสนุนบ้านพอเพยีง,2022,5.5575e+08,False
2022.3.2.2286,2022.3.2.2286,2022.3.2,529,กระทรวงการพัฒนาสังคมและความมนั่คงของมนุษย์,สถาบนัพัฒนาองคก์รชมุชน (องคก์ารมหาชน),False,แผนงานยุทธศาสตรส์ร้างหลกัประกนัทางสังคม,,โครงการพัฒนาคุณภาพชีวิตของผูม้ีรายไดน้อ้ยในเมื...,งบเงินอุดหนุน,เงินอุดหนุนทวั่ไป,,,,,สนบัสนุนพัฒนาคณุภาพชีวติของผูม้รีายไดน้อ้ยในเม...,2022,3.26556e+07,False


---
# **Clean Up!**

**Clear** temp files

In [33]:
temp.unlink()