From 283be36b669ee94f76ff4f69371b34dd9e154df7 Mon Sep 17 00:00:00 2001 From: Yigal Lazarev Date: Tue, 19 May 2015 00:07:03 +0300 Subject: [PATCH] Added xmltable2csv with capability to transform table-like XML files like xlsx to CSV --- README.md | 23 ++++++ samples/simple-table.xml | 30 +++++++ setup.py | 3 +- xmlutils/__init__.py | 2 +- xmlutils/console.py | 35 ++++++++- xmlutils/xmltable2csv.py | 163 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 251 insertions(+), 5 deletions(-) create mode 100644 samples/simple-table.xml create mode 100644 xmlutils/xmltable2csv.py diff --git a/README.md b/README.md index d50ee48..0d38d79 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,9 @@ ElementTree.iterparse() to iterate through nodes in an XML document, thus not needing to load the entire DOM into memory. The scripts can be used to churn through large XML files (albeit taking long :P) without memory hiccups. +Simple table-representing XMLs can be converted to CSV using xmltable2csv. It assumes each entry is encapsulated +in some tag, and has been successfully tested on some XLSX files. + Blind conversion of XML to CSV and SQL is not recommended. It only works if the structure of the XML document is simple (flat). On the other hand, xml2json supports complex XML documents with multiple @@ -12,6 +15,7 @@ nested hierarchies. Lastly, the XML files are not validated at the time of conve - Kailash Nadh, June 2013 +- Yigal Lazarev, May 2015 - License: MIT License - Documentation: [http://nadh.in/code/xmlutils.py](http://nadh.in/code/xmlutils.py) - Pypi: [https://pypi.python.org/pypi/xmlutils](https://pypi.python.org/pypi/xmlutils) @@ -50,6 +54,25 @@ xml2csv --input "samples/fruits.xml" --output "samples/fruits.csv" --tag "item" --buffer The number of records to be kept in memory before it is written to the output CSV file. Helps reduce the number of disk writes. Default is 1000 ``` +##xmltable2csv +Convert an XML table to a CSV file. 
+ +``` +xmltable2csv --input "samples/simple-table.xml" --output "samples/simple-table.csv" --tag "Data" +``` + +######Arguments +``` +--input Input XML table's filename* +--output Output CSV file's filename* +--tag The tag of the node that holds the text of a single table entry (Eg: Data)* +--delimiter Delimiter for separating items in a row. Default is , (a comma) +--noheader Exclude the header (first row of records in the XML) from the output. By default the header is written as the first line. +--encoding Character encoding of the document. Default is utf-8 +--limit Limit the number of records to be processed from the document to a particular number. Default is no limit (-1) +--buffer_size The number of records to be kept in memory before it is written to the output CSV file. Helps reduce the number of disk writes. Default is 1000. +``` + ##xml2sql Convert an XML document to an SQL file. diff --git a/samples/simple-table.xml b/samples/simple-table.xml new file mode 100644 index 0000000..6df962b --- /dev/null +++ b/samples/simple-table.xml @@ -0,0 +1,30 @@ + + + + + + + Header 1 + + + Header 2 + + + + + Value R1C1 + + + Value R1C2 + + + + + Value R2C1 + + + Value R2C2 + + + + diff --git a/setup.py b/setup.py index 33a783e..7843d1b 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,8 @@ 'console_scripts': [ 'xml2sql = xmlutils.console:run_xml2sql', 'xml2csv = xmlutils.console:run_xml2csv', - 'xml2json = xmlutils.console:run_xml2json' + 'xml2json = xmlutils.console:run_xml2json', + 'xmltable2csv = xmlutils.console:run_xmltable2csv' ], }, classifiers=[ diff --git a/xmlutils/__init__.py b/xmlutils/__init__.py index 21be77e..1cecfc8 100644 --- a/xmlutils/__init__.py +++ b/xmlutils/__init__.py @@ -1 +1 @@ -__all__ = ['xml2sql', 'xml2csv', 'xml2json'] \ No newline at end of file +__all__ = ['xml2sql', 'xml2csv', 'xml2json', 'xmltable2csv'] \ No newline at end of file diff --git a/xmlutils/console.py b/xmlutils/console.py index fada062..37fd76f 100644 --- a/xmlutils/console.py +++ 
# xmltable2csv -- XML table to CSV converter and its console entry point.
# Part of xmlutils (Kailash Nadh, http://nadh.in); xmltable2csv contributed
# by Yigal Lazarev, May 2015.
#
# License: MIT License
# Documentation: http://nadh.in/code/xmlutils.py

import argparse
import codecs
import xml.etree.ElementTree as ETree


class xmltable2csv(object):
    """Convert a table stored as an XML document into a CSV file.

    Unlike a hierarchy-aware converter, this class only looks for one
    "entry" tag (``Data`` by default) and groups consecutive entries into
    rows according to the nesting depth at which they appear. The names
    of the elements wrapping the entries are irrelevant.

    Illustrative input (element names other than the entry tag are
    arbitrary; namespaces or prefixes on the tags are ignored when
    matching)::

        <Workbook xmlns="...">
          <Worksheet>
            <Table>
              <Row>
                <Cell><Data>Header 1</Data></Cell>
                <Cell><Data>Header 2</Data></Cell>
              </Row>
              <Row>
                <Cell><Data>Value R1C1</Data></Cell>
                <Cell><Data>Value R1C2</Data></Cell>
              </Row>
            </Table>
          </Worksheet>
        </Workbook>

    which is written as::

        Header 1,Header 2
        Value R1C1,Value R1C2

    Row boundaries are inferred from the first two entries, so the first
    row is expected to hold at least two entries.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Open the input XML for incremental parsing and the output CSV.

        Keyword arguments:
        input_file -- input xml filename
        output_file -- output csv filename
        encoding -- character encoding of the output file

        Raises whatever ``codecs.open`` raises when the output file cannot
        be opened (after printing a short notice).
        """

        self.output_buffer = []
        self.output = None

        # open the xml file for iteration
        self.context = ETree.iterparse(input_file, events=("start", "end"))

        # output file handle
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except Exception:
            print("Failed to open the output file")
            raise

    def convert(self, tag="Data", delimiter=",", noheader=False,
                limit=-1, buffer_size=1000):
        """Convert the XML table file to a CSV file and close the output.

        Keyword arguments:
        tag -- the tag that contains a single entry's text. eg: Data
        delimiter -- csv field delimiter
        noheader -- when True, the first row is not written
        limit -- maximum number of entries to process (<= 0: no limit)
        buffer_size -- number of rows to keep in memory before writing
                       them to disk

        Returns:
        number of entries (matching tags) processed
        """

        items = []          # entries collected for the row being built
        depth = 0           # current nesting depth in the document
        min_depth = 0       # shallowest depth seen since the first entry
        row_depth = -1      # depth of the row container; -1 until known
        n = 0               # entries processed so far
        skip_header = noheader

        try:
            for event, elem in self.context:
                if event == "start":
                    depth += 1
                    continue

                depth -= 1
                if depth < min_depth:
                    min_depth = depth

                # Closing something shallower than the row container means
                # the current row is complete.
                if depth < row_depth and items:
                    if skip_header:
                        skip_header = False
                    else:
                        self.output_buffer.append(items)
                    items = []
                    # flush buffer to disk
                    if len(self.output_buffer) > buffer_size:
                        self._write_buffer(delimiter)

                if self._local_name(elem.tag) == tag:
                    # Stop before counting an entry that will not be used.
                    if 0 < limit <= n:
                        break
                    if n == 0:
                        min_depth = depth
                    elif n == 1:
                        # By the second entry, min_depth has dropped to the
                        # level the two entries share: the row container.
                        row_depth = min_depth
                    n += 1
                    # Empty elements have text None; emit an empty field.
                    items.append(elem.text if elem.text is not None else '')

                # The element is fully processed; release its content so
                # memory does not grow with the size of the document.
                elem.clear()

            # A row may still be pending if the limit interrupted it or the
            # row depth was never established (fewer than two entries).
            if items and not skip_header:
                self.output_buffer.append(items)
            self._write_buffer(delimiter)  # write rest of the buffer
        finally:
            # Always release the file handle so the data reaches the disk.
            self.output.close()

        return n

    @staticmethod
    def _local_name(full_tag):
        """Return the tag name without a '{namespace}' or 'prefix:' part."""
        last_delim = max(full_tag.rfind('}'), full_tag.rfind(':'))
        if 0 < last_delim < len(full_tag) - 1:
            return full_tag[last_delim + 1:]
        return full_tag

    @staticmethod
    def _quote(field, delimiter):
        """Return a field as CSV text, quoting it only when necessary."""
        if field is None:
            return ''
        needs_quotes = (
            (delimiter and delimiter in field)
            or '"' in field or '\n' in field or '\r' in field
        )
        if needs_quotes:
            return '"' + field.replace('"', '""') + '"'
        return field

    def _write_buffer(self, delimiter):
        """Write the buffered rows to the output file and empty the buffer."""

        if not self.output_buffer:
            # Nothing pending; avoid emitting a stray blank line.
            return

        lines = [
            delimiter.join([self._quote(field, delimiter) for field in row])
            for row in self.output_buffer
        ]
        self.output.write('\n'.join(lines) + '\n')
        self.output_buffer = []


def run_xmltable2csv(argv=None):
    """Console entry point: parse the command line and run the conversion.

    Keyword arguments:
    argv -- list of command-line arguments; None (the default) makes
            argparse read them from sys.argv

    Returns None so that it can be used directly as a console script.
    """
    print("""xmltable2csv by Yigal Lazarev (http://yig.al)
\t--help for help

\t""")

    # parse arguments
    parser = argparse.ArgumentParser(description='Convert an xml file to csv format.')
    parser.add_argument('--input', dest='input_file', required=True, help='input xml filename')
    parser.add_argument('--output', dest='output_file', required=True, help='output csv filename')
    parser.add_argument('--tag', dest='tag', required=True, help='the record tag. eg: Data')
    parser.add_argument('--delimiter', dest='delimiter', default=',', help='delimiter character. (default=,)')
    parser.add_argument('--noheader', dest='noheader', action='store_true', help='exclude csv header (default=False)')
    parser.add_argument('--encoding', dest='encoding', default='utf-8', help='character encoding (default=utf-8)')
    parser.add_argument('--limit', type=int, dest='limit', default=-1, help='maximum number of records to process')
    parser.add_argument('--buffer_size', type=int, dest='buffer_size', default=1000,
                        help='number of records to keep in buffer before writing to disk (default=1000)')

    args = parser.parse_args(argv)

    converter = xmltable2csv(args.input_file, args.output_file, args.encoding)
    num = converter.convert(tag=args.tag, delimiter=args.delimiter,
                            noheader=args.noheader, limit=args.limit,
                            buffer_size=args.buffer_size)

    print("\n\nWrote %s records to %s" % (num, args.output_file))