Skip to content

Commit

Permalink
Added xmltable2csv with capability to transform table-like XML files …
Browse files Browse the repository at this point in the history
…like xlsx to CSV
  • Loading branch information
egalev committed May 18, 2015
1 parent 606ab98 commit 283be36
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 5 deletions.
23 changes: 23 additions & 0 deletions README.md
Expand Up @@ -5,13 +5,17 @@ ElementTree.iterparse() to iterate through nodes in an XML document, thus not
needing to load the entire DOM into memory. The scripts can be used to churn
through large XML files (albeit taking long :P) without memory hiccups.

Simple table-representing XMLs can be converted to CSV using xmltable2csv. It assumes each entry is encapsulated
in some tag, and has been successfully tested on some XLSX files.

Blind conversion of XML to CSV and SQL is not recommended.
It only works if the structure of the XML document is simple (flat).
On the other hand, xml2json supports complex XML documents with multiple
nested hierarchies. Lastly, the XML files are not validated at the time of conversion.


- Kailash Nadh, June 2013
- Yigal Lazarev, May 2015
- License: MIT License
- Documentation: [http://nadh.in/code/xmlutils.py](http://nadh.in/code/xmlutils.py)
- Pypi: [https://pypi.python.org/pypi/xmlutils](https://pypi.python.org/pypi/xmlutils)
Expand Down Expand Up @@ -50,6 +54,25 @@ xml2csv --input "samples/fruits.xml" --output "samples/fruits.csv" --tag "item"
--buffer The number of records to be kept in memory before it is written to the output CSV file. Helps reduce the number of disk writes. Default is 1000
```

##xmltable2csv
Convert an XML table to a CSV file.

```
xmltable2csv --input "samples/simple-table.xml" --output "samples/simple-table.csv" --tag "Data"
```

######Arguments
```
--input Input XML table's filename*
--output Output CSV file's filename*
--tag The tag of the node that represents a single record (Eg: Data, record)*
--delimiter Delimiter for separating items in a row. Default is , (a comma)
--noheader Exclude the header (first row of records in the XML) from the output. This is a flag and takes no value; by default the header is written.
--encoding Character encoding of the document. Default is utf-8
--limit Limit the number of records to be processed from the document to a particular number. Default is no limit (-1)
--buffer_size The number of records to be kept in memory before it is written to the output CSV file. Helps reduce the number of disk writes. Default is 1000.
```

##xml2sql
Convert an XML document to an SQL file.

Expand Down
30 changes: 30 additions & 0 deletions samples/simple-table.xml
@@ -0,0 +1,30 @@
<?xml version="1.0" encoding="utf-8"?>
<?mso-application progid="Excel.Sheet"?>
<ss:Workbook xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet">
<ss:Table ss:ExpandedColumnCount="11" ss:FullColumns="1" ss:ExpandedRowCount="28" ss:FullRows="1">
<ss:Row>
<ss:Cell ss:StyleID="HeaderStyle">
<ss:Data ss:Type="String">Header 1</ss:Data>
</ss:Cell>
<ss:Cell ss:StyleID="HeaderStyle">
<ss:Data ss:Type="String">Header 2</ss:Data>
</ss:Cell>
</ss:Row>
<ss:Row>
<ss:Cell>
<ss:Data ss:Type="String">Value R1C1</ss:Data>
</ss:Cell>
<ss:Cell>
<ss:Data ss:Type="String">Value R1C2</ss:Data>
</ss:Cell>
</ss:Row>
<ss:Row>
<ss:Cell>
<ss:Data ss:Type="String">Value R2C1</ss:Data>
</ss:Cell>
<ss:Cell>
<ss:Data ss:Type="String">Value R2C2</ss:Data>
</ss:Cell>
</ss:Row>
</ss:Table>
</ss:Workbook>
3 changes: 2 additions & 1 deletion setup.py
Expand Up @@ -16,7 +16,8 @@
'console_scripts': [
'xml2sql = xmlutils.console:run_xml2sql',
'xml2csv = xmlutils.console:run_xml2csv',
'xml2json = xmlutils.console:run_xml2json'
'xml2json = xmlutils.console:run_xml2json',
'xmltable2csv = xmlutils.console:run_xmltable2csv'
],
},
classifiers=[
Expand Down
2 changes: 1 addition & 1 deletion xmlutils/__init__.py
@@ -1 +1 @@
__all__ = ['xml2sql', 'xml2csv', 'xml2json']
__all__ = ['xml2sql', 'xml2csv', 'xml2json', 'xmltable2csv']
35 changes: 32 additions & 3 deletions xmlutils/console.py
Expand Up @@ -2,14 +2,16 @@
Kailash Nadh, http://nadh.in
June 2013
License: MIT License
Documentation: http://nadh.in/code/xmlutils.py
License: MIT License
Documentation: http://nadh.in/code/xmlutils.py
"""

import argparse
from xmlutils.xml2sql import xml2sql
from xmlutils.xml2csv import xml2csv
from xmlutils.xml2json import xml2json
from xmlutils.xmltable2csv import xmltable2csv


def run_xml2sql():
print """xml2sql by Kailash Nadh (http://nadh.in)
Expand All @@ -36,7 +38,7 @@ def run_xml2sql():
num = converter.convert(tag=args.tag, table=args.table, ignore=args.ignore, limit=args.limit, packet=args.packet)

print "\n\nWrote", num['num'], "records to", args.output_file, \
" (INSERT queries =", num['num_insert'], ")"
" (INSERT queries =", num['num_insert'], ")"


def run_xml2csv():
Expand Down Expand Up @@ -67,6 +69,33 @@ def run_xml2csv():
print "\n\nWrote", num, "records to", args.output_file


def run_xmltable2csv():
    """Console entry point for the ``xmltable2csv`` command.

    Prints a short banner, parses the command-line options, runs the
    XML-table to CSV conversion and reports how many records were handled.
    """
    # Single-argument print calls behave identically on Python 2 and 3.
    print("xmltable2csv by Yigal Lazarev (http://yig.al)\n--help for help\n")

    # parse arguments
    parser = argparse.ArgumentParser(description='Convert an xml file to csv format.')
    parser.add_argument('--input', dest='input_file', required=True, help='input xml filename')
    parser.add_argument('--output', dest='output_file', required=True, help='output csv filename')
    parser.add_argument('--tag', dest='tag', required=True, help='the record tag. eg: Data')
    parser.add_argument('--delimiter', dest='delimiter', default=',', help='delimiter character. (default=,)')
    parser.add_argument('--noheader', dest='noheader', action='store_true', help='exclude csv header (default=False)')
    parser.add_argument('--encoding', dest='encoding', default='utf-8', help='character encoding (default=utf-8)')
    parser.add_argument('--limit', type=int, dest='limit', default=-1, help='maximum number of records to process')
    # The default is a real int so it matches the declared option type.
    parser.add_argument('--buffer_size', type=int, dest='buffer_size', default=1000,
                        help='number of records to keep in buffer before writing to disk (default=1000)')

    args = parser.parse_args()

    converter = xmltable2csv(args.input_file, args.output_file, args.encoding)
    num = converter.convert(tag=args.tag, delimiter=args.delimiter,
                            noheader=args.noheader, limit=args.limit,
                            buffer_size=args.buffer_size)

    print("\n\nWrote %s records to %s" % (num, args.output_file))


def run_xml2json():
print """xml2json by Kailash Nadh (http://nadh.in)
--help for help
Expand Down
163 changes: 163 additions & 0 deletions xmlutils/xmltable2csv.py
@@ -0,0 +1,163 @@
"""
xmltable2csv.py
Kailash Nadh, http://nadh.in
October 2011
License: MIT License
Documentation: http://nadh.in/code/xmlutils.py
"""

import codecs
import xml.etree.ElementTree as ETree


class xmltable2csv:
    """Convert a table stored as an XML document into a CSV file.

    This differs from the xml2csv tool, which tries to convey the XML
    hierarchy in the CSV output by descending into the child nodes of the
    selected tag. This class only collects the text of every element whose
    (namespace-stripped) tag matches the requested one and groups those
    values into rows.

    Example of the expected input
    =============================

    A table of the following form:

        Header 1        Header 2
        Value R1C1      Value R1C2
        Value R2C1      Value R2C2

    will be formatted something along the lines of the following XML in
    Microsoft Excel:

        <?xml version="1.0" encoding="utf-8"?>
        <?mso-application progid="Excel.Sheet"?>
        <ss:Workbook xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet">
          <ss:Table ss:ExpandedColumnCount="11" ss:FullColumns="1" ss:ExpandedRowCount="28" ss:FullRows="1">
            <ss:Row>
              <ss:Cell ss:StyleID="HeaderStyle">
                <ss:Data ss:Type="String">Header 1</ss:Data>
              </ss:Cell>
              <ss:Cell ss:StyleID="HeaderStyle">
                <ss:Data ss:Type="String">Header 2</ss:Data>
              </ss:Cell>
            </ss:Row>
            <ss:Row>
              <ss:Cell>
                <ss:Data ss:Type="String">Value R1C1</ss:Data>
              </ss:Cell>
              <ss:Cell>
                <ss:Data ss:Type="String">Value R1C2</ss:Data>
              </ss:Cell>
            </ss:Row>
            <ss:Row>
              <ss:Cell>
                <ss:Data ss:Type="String">Value R2C1</ss:Data>
              </ss:Cell>
              <ss:Cell>
                <ss:Data ss:Type="String">Value R2C2</ss:Data>
              </ss:Cell>
            </ss:Row>
          </ss:Table>
        </ss:Workbook>

    This might be a bit different in later versions, but the general form
    is the same. Notice that the tags are namespaced, and this namespacing
    might be somewhat obfuscated, in the form of a xmlns property in the
    containing 'Workbook' tag.

    Simple XML-formatted tables (not tested with XLSX sheets containing
    formulas etc) are converted regardless of the specific tagging and
    hierarchy structure. Tested with some XLSX files and worked fine even
    for files that wouldn't convert in tools such as dilshod's xlsx2csv.

    Limitation: the row boundary is inferred from the first two matching
    elements, so they are assumed to belong to the same row.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Open the input XML file for streaming and the output CSV file.

        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding of the output file

        Any error raised while opening the output file is reported on
        stdout and re-raised unchanged.
        """
        self.output_buffer = []
        self.output = None

        # Iterate over parse events rather than building the tree up front.
        self.context = ETree.iterparse(input_file, events=("start", "end"))

        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except Exception:
            print("Failed to open the output file")
            raise

    def convert(self, tag="Data", delimiter=",", noheader=False,
                limit=-1, buffer_size=1000):
        """Convert the XML table file to a CSV file.

        Keyword arguments:
        tag -- tag (without namespace) of the element holding a single
               entry's text. eg: Data (Microsoft XLSX)
        delimiter -- csv field delimiter
        noheader -- when True, the first row found is not written
        limit -- maximum number of entries to process; values <= 0 mean
                 no limit. Entries of a row that is still incomplete when
                 the limit is reached are not written.
        buffer_size -- number of rows to keep in memory before they are
                       written to disk

        Returns:
        number of matching entries (cells, not rows) that were processed

        The output file is closed when the conversion finishes or fails.
        """
        items = []

        depth = 0
        min_depth = 0
        row_depth = -1  # unknown until the second matching element is seen
        n = 0

        try:
            # iterate through the xml
            for event, elem in self.context:
                if event == "start":
                    depth += 1
                    continue

                depth -= 1
                if depth < min_depth:
                    min_depth = depth

                # Closing an element above the row level ends the current row.
                if depth < row_depth and items:
                    if noheader:
                        # Drop the first row only.
                        noheader = False
                    else:
                        self.output_buffer.append(items)
                    items = []
                    # flush buffer to disk
                    if len(self.output_buffer) >= buffer_size:
                        self._write_buffer(delimiter)

                if tag == self._local_name(elem.tag):
                    # Check the limit before counting, so the entry that
                    # hits the limit is neither stored nor reported.
                    if 0 < limit <= n:
                        break
                    if n == 0:
                        min_depth = depth
                    elif n == 1:
                        # Shallowest depth reached between the first two
                        # entries: the level at which their row lives.
                        row_depth = min_depth
                    n += 1
                    items.append(elem.text)

                # The element is fully handled; free its content so memory
                # use does not grow with the size of the document.
                elem.clear()

            self._write_buffer(delimiter)  # write rest of the buffer to file
        finally:
            self.output.close()

        return n

    @staticmethod
    def _local_name(tag):
        """Return the tag with any '{namespace}' or 'prefix:' part removed."""
        last_delim = max(tag.rfind('}'), tag.rfind(':'))
        if 0 < last_delim < len(tag) - 1:
            return tag[last_delim + 1:]
        return tag

    @staticmethod
    def _format_field(value, delimiter):
        """Return one cell value as CSV text.

        Empty elements (text is None) become an empty field. Values that
        contain the delimiter, a double quote or a line break are wrapped
        in double quotes, with embedded quotes doubled.
        """
        if value is None:
            return ''
        needs_quoting = (
            '"' in value or '\n' in value or '\r' in value
            or (delimiter and delimiter in value)
        )
        if needs_quoting:
            return '"' + value.replace('"', '""') + '"'
        return value

    def _write_buffer(self, delimiter):
        """Write buffered rows to the output file and empty the buffer."""
        if not self.output_buffer:
            # Nothing pending: avoid emitting a stray blank line.
            return

        lines = [
            delimiter.join([self._format_field(value, delimiter) for value in row])
            for row in self.output_buffer
        ]
        self.output.write('\n'.join(lines) + '\n')
        self.output_buffer = []

0 comments on commit 283be36

Please sign in to comment.