In [1]:
import os
import traceback

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser,PDFDocument
# Make sure the directory that parse() writes its text files into exists.
# The name uses the same spelling ('outputFile') as the path built in parse().
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(".\\outputFile", exist_ok=True)
    
def parse(filePath):
    """Extract the text of a PDF, echo it, and save it as a UTF-8 text file.

    The text of every LTTextBoxHorizontal found on every page is printed and
    written (followed by a newline) to
    '.\\outputFile\\output--of--<name>.txt', where <name> is the PDF's file
    name without its directory and extension. The output file is rewritten
    from scratch on every call, so repeated runs never accumulate duplicates.

    Args:
        filePath: Path of the PDF to read. Both '\\' and '/' are accepted as
            directory separators when deriving the output file name.

    Returns:
        None. Failures (missing file, extraction not allowed, parser errors)
        are not propagated; the traceback is printed instead.
    """
    try:
        # Open in binary mode; the context manager closes the PDF even when
        # parsing fails part-way through.
        with open(filePath, 'rb') as fp:
            # Build a parser on the file object and link it to a document.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)

            # No password is passed, so the library default is used
            # (original note: an empty string when there is no password).
            doc.initialize()

            # Documents that forbid text extraction are reported as an error.
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager (shared resources), layout-aggregating device,
            # and the interpreter that renders pages into that device.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # The output name does not depend on the page, so build it once.
            # Normalising '/' to '\\' keeps the stem from coming out empty for
            # paths such as './dir/file.pdf'; splitext strips only the final
            # extension, so 'a.b.pdf' maps to 'a.b' rather than 'a'.
            base_name = filePath.replace('/', '\\').split('\\')[-1]
            stem = os.path.splitext(base_name)[0]
            output_dir = '.\\outputFile'
            outputFileName = output_dir + '\\output--of--' + stem + '.txt'
            os.makedirs(output_dir, exist_ok=True)

            # Open the output once in 'w' mode: this truncates any result of
            # a previous run and avoids reopening the file per text box.
            with open(outputFileName, 'w', encoding='utf-8') as out:
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # The device now holds the layout of the processed page;
                    # only horizontal text boxes are used here.
                    layout = device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            results = element.get_text()
                            print(results)  # echo the extracted text
                            out.write(results + '\n')
    except Exception:
        # Best effort: report the failure without aborting the notebook.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        traceback.print_exc()

## 测试英文文件读入

In [2]:
# Run the extractor on the English sample PDF.
parse(r".\fileNeeded\test-en.pdf")

A Brief Introduction to the Basics of Game Theory

Matthew O. Jackson, Stanford University

I provide a (very) brief introduction to game theory. I have developed these notes to

provide quick access to some of the basics of game theory; mainly as an aid for students

in courses in which I assumed familiarity with game theory but did not require it as a

prerequisite. Of course, the material discussed here is only the proverbial tip of the iceberg,

and there are many sources that oﬀer much more complete treatments of the sub ject.1 Here,

I only cover a few of the most fundamental concepts, and provide just enough discussion

to get the ideas across without discussing many issues associated with the concepts and

approaches. Fuller coverage is available through a free on-line course that can be found via
my website: http://www.stanford.edu/∼jacksonm/
The basic elements of performing a noncooperative 2 game-theoretic analysis are (1)

framing the situation in terms of the actions avail

## 测试中文文件读入

In [3]:
# Run the extractor on the Simplified-Chinese sample PDF.
parse(r".\fileNeeded\test-zh_cn.pdf")

目录 

一、广州奖概述 ................................................................................................... 2 
二、全球城市创新的主要议题 ............................................................................ 3 

三、议题、资金与动员：城市创新机制的构建 .................................................. 9 
（一）议题倡导机制 ........................................................................................ 9 
1.创新动因：为什么创新................................................................................... 10 
2.创新主体：谁来创新....................................................................................... 13 
3.创新方式：如何创新？ ................................................................................ 16 
（二）资金筹集 .............................................................................................. 20 
1.项目资金一般性来源....................................................................................... 21 
2.主要筹集渠道.............................................................................................

In [4]:
# Check os.path.join on a '/'-separated path: the platform separator is
# appended, which here produced the mixed-separator result './fileNeeded\\'.
import os
os.path.join("./fileNeeded", "")

'./fileNeeded\\'