## Extract Information from Documents

#### Part 1: Extract information from Microsoft Word Documents

In [1]:
from docx import Document

def extract_text_from_docx(docx_path):
    # Load the Word document
    doc = Document(docx_path)
    
    # List to hold all the text in the document
    full_text = []

    # Iterate over each paragraph in the document
    for para in doc.paragraphs:
        full_text.append(para.text)

    # Join all the text separated by a newline
    return '\n'.join(full_text)

def extract_tables_from_docx(docx_path):
    # Load the Word document
    doc = Document(docx_path)
    
    # List to hold all tables data
    tables_data = []

    # Iterate over each table in the document
    for table in doc.tables:
        table_text = []
        for row in table.rows:
            row_data = []
            for cell in row.cells:
                row_data.append(cell.text.strip())
            table_text.append(row_data)
        tables_data.append(table_text)
    
    return tables_data

from xml.etree import ElementTree as ET
def extract_hyperlinks(docx_path):
    doc = Document(docx_path)
    hyperlinks = []

    # Define the WordprocessingML namespace
    w_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    r_namespace = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}'

    # Helper function to process hyperlink elements
    def process_hyperlink(hyperlink_el, rels):
        r_id = hyperlink_el.get(r_namespace + 'id')
        if r_id and r_id in rels:
            link = rels[r_id]._target
            text = ''.join(node.text for node in hyperlink_el if node.tag == w_namespace + 't')
            hyperlinks.append({'text': text, 'url': link})

    # Get the relationships dictionary
    rels = doc.part.rels

    # Iterate through the document elements to find hyperlinks
    for el in doc.element.body.iter():
        if el.tag == w_namespace + 'hyperlink':
            process_hyperlink(el, rels)

    return hyperlinks


inP = "C:\\EconS524\\EconS524 Syllabus.docx"
text = extract_text_from_docx(inP)
print(text)

tables = extract_tables_from_docx(inP)

# Print all extracted table data
for i, table in enumerate(tables, start=1):
    print(f"Table {i}:")
    for row in table:
        print(row)

hyperlinks = extract_hyperlinks(inP)

for hyperlink in hyperlinks:
    print(f"Text: {hyperlink['text']}, URL: {hyperlink['url']}")


EconS 524, Spring 2023
Applied Machine Learning for Economics
Time: Mon. and Wed. 4:10pm-5:25pm
Location: Hulbert 23
Credit 3, Course Prerequisites: EconS 525, 526 

Jia Yan
Office Hours (Hulbert 301E): Tuesday 2:00pm – 4:00pm
E-mail: jiay@wsu.edu

Class Website: canvas
You will have to enter your WSU username and password to access the course materials.

Class Description
This course gives an overview of basic concepts and algorithms in machine learning and their connections to econometrics. The course will first review the following concepts and modeling techniques: linear and non-linear regressions, non-parametrics, support vector machine, random forests, supervised and non-supervised learning, deep learning, and natural language processing, and then discuss the applications of these techniques in economics.

Readings:

Required Text book: T. Hastie, R. Tibshirani and J. Friedman, , 
 
QuantEcon Data Science 

Course Objectives: This course is designed for Master and PhD students wi

#### Part 2: Extract from PDF

In [2]:
import tabula
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

def extract_text_from_pdf(path):
    '''
    This script defines a function extract_text_from_pdf which:

    1. Opens a PDF file in binary read mode.
    2. Creates a PDF parser for the file and a PDF document object from the parser.
    3. Initializes a resource manager and a text converter with layout parameters.
    4. Sets up a PDF page interpreter.
    5. Iterates over each page in the PDF document and processes it to extract the text.
    6. Captures the extracted text into a StringIO object.
    7. Retrieves the value from the StringIO object, closes it, and then returns the text.
    '''
    output_string = StringIO()
    with open(path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    text = output_string.getvalue()
    output_string.close()
    return text
   
def extract_tables_from_pdf(path):
    '''
    You need to have Java installed on your system to use Tabula. 
    '''
    df = tabula.read_pdf(path, pages='all', multiple_tables=True)
    return df 

inP = "C:\\EconS524\\EconS524 Syllabus.pdf"
pdf_text = extract_text_from_pdf(inP)
print(pdf_text)

tables =  extract_tables_from_pdf(inP)
for i, table in enumerate(tables):
    print(f"Table {i}:")
    print(table)
    # You can also export to CSV, Excel, etc.
    # table.to_csv(f"table_{i}.csv")


  from pandas.core.computation.check import NUMEXPR_INSTALLED


EconS 524, Spring 2023 

Applied Machine Learning for Economics 
Time: Mon. and Wed. 4:10pm-5:25pm 

Location: Hulbert 23 

Credit 3, Course Prerequisites: EconS 525, 526  

4.  Implement the models on the data using Python 
5.  Write up the findings as a short research paper (no more than 20 pages) 

approaches 

  

 
Jia Yan 
Office Hours (Hulbert 301E): Tuesday 2:00pm – 4:00pm 
E-mail: jiay@wsu.edu 
 
Class Website: canvas 
You will have to enter your WSU username and password to access the course materials. 
 
Class Description 
This course gives an overview of basic concepts and algorithms in machine learning and 
their connections to econometrics. The course will first review the following concepts and 
modeling techniques: linear and non-linear regressions, non-parametrics, support vector 
machine,  random  forests,  supervised  and  non-supervised  learning,  deep  learning,  and 
natural  language  processing,  and  then  discuss  the  applications  of  these  techniques  in 

Got stderr: Jan 16, 2024 8:38:50 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache
Jan 16, 2024 8:38:50 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Jan 16, 2024 8:38:51 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>



Table 0:
    Week                                               Plan
0      1  Introduction to statistical learning and Pytho...
1    2-7  Supervised Learning\r1.\rLinear Methods for Re...
2      8  Un-supervised learning\r1.\rClustering\r2.Dime...
3   9-10     Deep learning: introduction to neural networks
4     11       Natural language processing (NLP) techniques
5  12-13  Introduction to the following techniques: resa...
6     14  Applications in economics: prediction, classif...


### Part 3: Extract from image

In [35]:
import cv2
import pytesseract
import platform
from PIL import Image, ImageFilter, ImageEnhance

def image_to_text(image_file_path, openCV=True):
    if platform.system() == "Windows":
        # this sentence should be platform independent; under winsdows need the following command
        pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    
    if openCV:
        # load image
        image = cv2.imread(image_file_path)

        # Convert the image to grayscale
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Apply adaptive thresholding to create a binary image
        _,thresh1 = cv2.threshold(image,120,255,cv2.THRESH_BINARY)

        # Perform OCR using Tesseract
        text = pytesseract.image_to_string(thresh1, lang='eng')
    else:
        # Load the image from the specified path
        img = Image.open(image_file_path)
    
        # Convert the image to grayscale to improve OCR accuracy
        img = img.convert('L')

        # Enhance the image contrast
        enhancer = ImageEnhance.Contrast(img)
        img = enhancer.enhance(2)
    
        # Apply a threshold filter to make the image binary
        img = img.point(lambda x: 0 if x < 128 else 255, '1')
    
        # Optional: resize the image to double its size, to make it easier for Tesseract to read
        img = img.resize([2 * s for s in img.size], Image.LANCZOS)   
        
        # Perform OCR on the grayscale image
        text = pytesseract.image_to_string(img, lang='eng')
    return text

inP = "C:\\EconS524\\hand_writing.jpg"
text = image_to_text(inP, openCV='False')
print(text)

WHitg fim Alive Tle mage
Tiny CHANGES To CARTH



#### Part 4: Extract PDF Image

In [None]:
class PdfOcrExtract(object): 
    
    def __init__(self, inpath, recycle=None, poppler_path=None, image_type="png", image_dpi=300, size=None,
                 paddle=True):
        self.inpath = inpath
        self.dpi = image_dpi
        self.image_type = image_type
        self.poppler_path = poppler_path   
        self.size = size         
        self.paddle = paddle
        if recycle is None:
            # recycle is a temporary directory to store temporary image files
            tmp = os.path.join(inpath, "recycle")
            if os.path.isdir(tmp):
                shutil.rmtree(tmp)
            self.recycle = tmp
            os.mkdir(tmp)
        else:
            self.recycle = recycle

    def empty_folder_images(self):
        '''
        Returns
        -------
        None.
        Empty all image files from the recycle directory
        '''
        try:
            filelist = [ f for f in os.listdir(self.recycle) if f.endswith(self.image_type) ]
            for f in filelist:
                os.remove(os.path.join(self.recycle, f))
        except:
            pass
    
    def images_by_page(self, path):
        convert_from_path(path, fmt=self.image_type, 
                                  dpi=self.dpi, output_folder=self.recycle, use_cropbox=False,
                                  poppler_path=self.poppler_path, size=self.size)
    
    def image_to_text(self, image_file):
        import platform
        if platform.system() == "Windows":
            # this sentence should be platform independent; under winsdows need the following command
            pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
    
        # load image
        image = cv2.imread(os.path.join(self.recycle, image_file))
    
        # convert the image to black and white for better OCR
        ret,thresh1 = cv2.threshold(image,120,255,cv2.THRESH_BINARY)
    
        # pytesseract image to string to get results
        text = pytesseract.image_to_string(thresh1, lang='eng') 
        text = text.strip().replace("\n", "")
        return text
    
    def batch_text_extract(self):
        filelist = [ f for f in os.listdir(self.recycle) if f.endswith(self.image_type) ]
        filelist.sort()
        x = list(map(self.image_to_text, filelist))
        return "".join(item for item in x)

    def text_extraction_file(self, file):
        # initialize the folder of storing intermediate image files 
        self.empty_folder_images()
        try:
            # convert each page of a file to an image file
            self.images_by_page(file)
            # extract text from all the images
            contents = self.batch_text_extract().replace(" ", "")
        except Exception as e:
            print('Exception',e)
            contents = ""
        return contents
 
