forked from iyad-obeid/docx2txt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
docx2txt.py
executable file
·172 lines (135 loc) · 5.29 KB
/
docx2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/python
# DOCX2TXT.PY
#
# Iyad Obeid, 11/18/2014, v1.0.2
#
# Converts docx to text
# Run with -h or -help flag for more information on how to run
#
# Code is based on docx.py which is downloaded from here:
# https://github.com/mikemaccana/python-docx
#
# Installation requires
# apt-get install libxml2-dev libxslt1-dev python-dev
# (this may be required on linux, shouldn't be necessary on later
# model osx systems)
#
# sudo pip install lxml
# sudo pip install Pillow (formerly PIL)
import sys
import string
import os.path
from docx import opendocx, getdocumenttext
def _exitWithError(message, hint=None):
    """Print an error message (plus an optional hint line) and exit with status 1."""
    print(' ')
    print('ERROR: ' + message)
    if hint is not None:
        print(hint)
    print(' ')
    sys.exit(1)


def _printHelp():
    """Print the usage screen."""
    print(' ')
    print('DOCX2TXT.py : converts an MS Word docx file to text')
    print(' ./docx2txt.py inputfile.docx outputfile.txt')
    print(' optional switches: -noheader (-nohdr), -nofooter (-noftr)')
    print(' -verbose (-v), -help')
    print(' ')


def _convertSection(fileNameInput, newfile, label, partNames, verboseFlag):
    """Extract every part in partNames and return the combined status string.

    Every part is always attempted (there may be several header/footer
    parts, e.g. for odd/even pages). The result is ' found' if at least
    one part reported ' found', otherwise ' not found'.
    """
    if verboseFlag:
        # No trailing newline: the status is printed on the same line.
        sys.stdout.write(' Searching for ' + label + ' ... ')
        sys.stdout.flush()

    statuses = [getTheText(fileNameInput, newfile, partName)
                for partName in partNames]

    # Both status strings are non-empty (truthy), so they cannot be combined
    # with `or`; test for the success value explicitly.
    status = ' found' if ' found' in statuses else ' not found'

    if verboseFlag:
        print(status)
    return status


def main():
    """Command-line entry point: convert a docx file to plain text.

    Reads switches and file names from ``sys.argv``:

        docx2txt.py [switches] inputfile.docx outputfile.txt

    Switches are matched case-insensitively and may appear before, between
    or after the two file names:

        -noheader / -nohdr   skip the header parts
        -nofooter / -noftr   skip the footer parts
        -verbose  / -v       report progress on stdout
        -help     / -h       print the usage screen and return

    Usage errors (unknown switch, wrong number of file names, missing
    input file) print a message on stdout and exit with status 1.
    """
    # initialize flags
    headerFlag = True
    footerFlag = True
    bodyFlag = True  # the body is always extracted; no switch disables it
    verboseFlag = False
    helpFlag = False
    fileNames = []

    # Separate the switches from the positional file names so that the
    # switches can be given in any position on the command line.
    for argument in sys.argv[1:]:
        switch = argument.lower()
        if switch in ('-noheader', '-nohdr'):
            headerFlag = False
        elif switch in ('-nofooter', '-noftr'):
            footerFlag = False
        elif switch in ('-verbose', '-v'):
            verboseFlag = True
        elif switch in ('-help', '-h'):
            helpFlag = True
        elif argument.startswith('-'):
            # unknown switch
            _exitWithError(
                'switch ' + argument.upper() + ' not found',
                " Try './docx2txt.py -help' for more options")
        else:
            fileNames.append(argument)

    # Help takes precedence over everything else and needs no file names.
    if helpFlag:
        _printHelp()
        return

    # Exactly two file names (input and output) are required.
    if len(fileNames) != 2:
        _exitWithError('provide input and output filenames')
    fileNameInput, fileNameOutput = fileNames

    # check to see if the specified input file exists
    if not os.path.isfile(fileNameInput):
        _exitWithError('input file ' + fileNameInput + ' not found')

    # Convert the word docx to text
    if verboseFlag:
        print(' Opening input file ' + fileNameInput)

    # The context manager closes the output file even if extraction fails.
    with open(fileNameOutput, 'w') as newfile:
        if headerFlag:
            # There may be up to three header parts (odd/even/both), so
            # all three are checked.
            _convertSection(fileNameInput, newfile, 'header',
                            ('hdr1', 'hdr2', 'hdr3'), verboseFlag)
        if bodyFlag:
            _convertSection(fileNameInput, newfile, 'body',
                            ('body',), verboseFlag)
        if footerFlag:
            _convertSection(fileNameInput, newfile, 'footer',
                            ('ftr1', 'ftr2', 'ftr3'), verboseFlag)
# end of main
def getTheText(fileNameInput, newfile, fileType):
    """Extract the text of one part of a docx file and write it to newfile.

    Parameters:
        fileNameInput -- path of the docx file, passed to ``opendocx``
        newfile       -- open, writable file object receiving the text
        fileType      -- part selector passed to ``opendocx``; the caller
                         uses 'hdr1'..'hdr3', 'body' and 'ftr1'..'ftr3'

    Returns the string ' found' when the part could be opened and read, or
    ' not found' otherwise. Both values are non-empty strings, so callers
    must compare them rather than test their truthiness.

    Each paragraph is written followed by a newline, and the whole part is
    followed by two extra newlines. Nothing is written when the part
    contains no text. Errors raised while writing are not swallowed.
    """
    try:
        # open the respective xml file and extract its text
        document = opendocx(fileNameInput, fileType)
        paratextlist = getdocumenttext(document)
    except Exception:
        # Presumably raised when the docx has no such part; the exact
        # exception type depends on the docx module -- TODO confirm and
        # narrow. KeyboardInterrupt/SystemExit are no longer swallowed.
        return ' not found'

    if paratextlist:
        text = u''.join(paratext + u'\n' for paratext in paratextlist)
        text += u'\n\n'
        if not isinstance(text, str):
            # Python 2: `str` is the byte-string type, so unicode text is
            # encoded explicitly. On Python 3 the text is already `str`
            # and the file object performs the encoding.
            text = text.encode('utf-8')
        newfile.write(text)

    return ' found'
# end of getTheText
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()