Added xmltable2csv with capability to transform table-like XML files …

…like xlsx to CSV
knadh · May 18, 2015 · 283be36 · 283be36
1 parent 606ab98
commit 283be36
Show file tree

Hide file tree

Showing 6 changed files with 251 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -5,13 +5,17 @@ ElementTree.iterparse() to iterate through nodes in an XML document, thus not
 needing to load the entire DOM into memory. The scripts can be used to churn 
 through large XML files (albeit taking long :P) without memory hiccups.
 
+Simple XML files that represent tables can be converted to CSV using xmltable2csv. It assumes each entry is encapsulated
+in some tag, and has been successfully tested on some XLSX files.
+
 Blind conversion of XML to CSV and SQL is not recommended.
 It only works if the structure of the XML document is simple (flat). 
 On the other hand, xml2json supports complex XML documents with multiple
 nested hierarchies. Lastly, the XML files are not validated at the time of conversion.
 
 
 - Kailash Nadh, June 2013
+- Yigal Lazarev, May 2015
 - License: MIT License
 - Documentation: [http://nadh.in/code/xmlutils.py](http://nadh.in/code/xmlutils.py)
 - Pypi: [https://pypi.python.org/pypi/xmlutils](https://pypi.python.org/pypi/xmlutils)
@@ -50,6 +54,25 @@ xml2csv --input "samples/fruits.xml" --output "samples/fruits.csv" --tag "item"
 --buffer 	The number of records to be kept in memory before it is written to the output CSV file. Helps reduce the number of disk writes. Default is 1000
 ```
 
+##xmltable2csv
+Convert an XML table to a CSV file.
+
+```
+xmltable2csv --input "samples/simple-table.xml" --output "samples/simple-table.csv" --tag "Data"
+```
+
+######Arguments
+```
+--input         Input XML table's filename*
+--output        Output CSV file's filename*
+--tag           The tag of the node that represents a single record (Eg: Data, record)*
+--delimiter     Delimiter for separating items in a row. Default is , (a comma)
+--noheader      Exclude the header (first row of records in the XML) from the output CSV. This is a flag and takes no value; by default the header is included.
+--encoding      Character encoding of the document. Default is utf-8
+--limit         Limit the number of records to be processed from the document to a particular number. Default is no limit (-1)
+--buffer_size   The number of records to be kept in memory before they are written to the output CSV file. Helps reduce the number of disk writes. Default is 1000.
+```
+
 ##xml2sql
 Convert an XML document to an SQL file.
 

diff --git a/samples/simple-table.xml b/samples/simple-table.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<?mso-application progid="Excel.Sheet"?>
+<ss:Workbook xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet">
+<ss:Table ss:ExpandedColumnCount="11" ss:FullColumns="1" ss:ExpandedRowCount="28" ss:FullRows="1">
+<ss:Row>
+<ss:Cell ss:StyleID="HeaderStyle">
+  <ss:Data ss:Type="String">Header 1</ss:Data>
+</ss:Cell>
+<ss:Cell ss:StyleID="HeaderStyle">
+  <ss:Data ss:Type="String">Header 2</ss:Data>
+</ss:Cell>
+</ss:Row>
+<ss:Row>
+<ss:Cell>
+  <ss:Data ss:Type="String">Value R1C1</ss:Data>
+</ss:Cell>
+<ss:Cell>
+  <ss:Data ss:Type="String">Value R1C2</ss:Data>
+</ss:Cell>
+</ss:Row>
+<ss:Row>
+<ss:Cell>
+  <ss:Data ss:Type="String">Value R2C1</ss:Data>
+</ss:Cell>
+<ss:Cell>
+  <ss:Data ss:Type="String">Value R2C2</ss:Data>
+</ss:Cell>
+</ss:Row>
+</ss:Table>
+</ss:Workbook>
diff --git a/setup.py b/setup.py
@@ -16,7 +16,8 @@
 		'console_scripts': [
 			'xml2sql = xmlutils.console:run_xml2sql',
 			'xml2csv = xmlutils.console:run_xml2csv',
-			'xml2json = xmlutils.console:run_xml2json'
+			'xml2json = xmlutils.console:run_xml2json',
+			'xmltable2csv = xmlutils.console:run_xmltable2csv'
 		],
 	},
 	classifiers=[

diff --git a/xmlutils/__init__.py b/xmlutils/__init__.py
@@ -1 +1 @@
-__all__ = ['xml2sql', 'xml2csv', 'xml2json']
+__all__ = ['xml2sql', 'xml2csv', 'xml2json', 'xmltable2csv']
diff --git a/xmlutils/console.py b/xmlutils/console.py
@@ -2,14 +2,16 @@
 	Kailash Nadh, http://nadh.in
 	June 2013
 	
-	License:        MIT License
-	Documentation:    http://nadh.in/code/xmlutils.py
+	License:		MIT License
+	Documentation:	http://nadh.in/code/xmlutils.py
 """
 
 import argparse
 from xmlutils.xml2sql import xml2sql
 from xmlutils.xml2csv import xml2csv
 from xmlutils.xml2json import xml2json
+from xmlutils.xmltable2csv import xmltable2csv
+
 
 def run_xml2sql():
 	print """xml2sql by Kailash Nadh (http://nadh.in)
@@ -36,7 +38,7 @@ def run_xml2sql():
 	num = converter.convert(tag=args.tag, table=args.table, ignore=args.ignore, limit=args.limit, packet=args.packet)
 
 	print "\n\nWrote", num['num'], "records to", args.output_file, \
-		  " (INSERT queries =", num['num_insert'], ")"
+		" (INSERT queries =", num['num_insert'], ")"
 
 
 def run_xml2csv():
@@ -67,6 +69,33 @@ def run_xml2csv():
 	print "\n\nWrote", num, "records to", args.output_file
 
 
+def run_xmltable2csv():
+	print """xmls2csv by Yigal Lazarev (http://yig.al)
+	--help for help
+
+	"""
+
+	# parse arguments
+	parser = argparse.ArgumentParser(description='Convert an xml file to csv format.')
+	parser.add_argument('--input', dest='input_file', required=True, help='input xml filename')
+	parser.add_argument('--output', dest='output_file', required=True, help='output csv filename')
+	parser.add_argument('--tag', dest='tag', required=True, help='the record tag. eg: Data')
+	parser.add_argument('--delimiter', dest='delimiter', default=',', help='delimiter character. (default=,)')
+	parser.add_argument('--noheader', dest='noheader', action='store_true', help='exclude csv header (default=False)')
+	parser.add_argument('--encoding', dest='encoding', default='utf-8', help='character encoding (default=utf-8)')
+	parser.add_argument('--limit', type=int, dest='limit', default=-1, help='maximum number of records to process')
+	parser.add_argument('--buffer_size', type=int, dest='buffer_size', default='1000',
+						help='number of records to keep in buffer before writing to disk (default=1000)')
+
+	args = parser.parse_args()
+
+	converter = xmltable2csv(args.input_file, args.output_file, args.encoding)
+	num = converter.convert(tag=args.tag, delimiter=args.delimiter,
+							noheader=args.noheader, limit=args.limit, buffer_size=args.buffer_size)
+
+	print "\n\nWrote", num, "records to", args.output_file
+
+
 def run_xml2json():
 	print """xml2json by Kailash Nadh (http://nadh.in)
 	--help for help

diff --git a/xmlutils/xmltable2csv.py b/xmlutils/xmltable2csv.py
@@ -0,0 +1,163 @@
+"""
+    xml2csv.py
+    Kailash Nadh, http://nadh.in
+    October 2011
+    
+    License:        MIT License
+    Documentation:    http://nadh.in/code/xmlutils.py
+"""
+
+import codecs
+import xml.etree.ElementTree as ETree
+
+
+class xmltable2csv:
+    """
+    This class is intended to convert tables formatted as XML document, to a
+    comma-separated value lines (CSV) file.
+
+    This is a bit different than the xml2csv tool, which tries to convey the XML hierarchy
+    into a CSV file - it keeps descending to the selected tags child nodes and translates these as well.
+
+    Example for the expected input to this converter class:
+    =======================================================
+
+    A table of the following form:
+
+    Header 1           Header 2
+    Value R1C1         Value R1C2
+    Value R2C1         Value R2C2
+
+    Will be formatted something along the lines of the following XML in Microsoft Excel:
+
+     <?xml version="1.0" encoding="utf-8"?>
+     <?mso-application progid="Excel.Sheet"?>
+     <ss:Workbook xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet">
+     <ss:Table ss:ExpandedColumnCount="11" ss:FullColumns="1" ss:ExpandedRowCount="28" ss:FullRows="1">
+      <ss:Row>
+        <ss:Cell ss:StyleID="HeaderStyle">
+          <ss:Data ss:Type="String">Header 1</ss:Data>
+        </ss:Cell>
+        <ss:Cell ss:StyleID="HeaderStyle">
+          <ss:Data ss:Type="String">Header 2</ss:Data>
+        </ss:Cell>
+      </ss:Row>
+      <ss:Row>
+        <ss:Cell>
+          <ss:Data ss:Type="String">Value R1C1</ss:Data>
+        </ss:Cell>
+        <ss:Cell>
+          <ss:Data ss:Type="String">Value R1C2</ss:Data>
+        </ss:Cell>
+      </ss:Row>
+      <ss:Row>
+        <ss:Cell>
+          <ss:Data ss:Type="String">Value R2C1</ss:Data>
+        </ss:Cell>
+        <ss:Cell>
+          <ss:Data ss:Type="String">Value R2C2</ss:Data>
+        </ss:Cell>
+      </ss:Row>
+     </ss:Table>
+     </ss:Workbook>
+
+    This might be a bit different in later versions, but the general form is the same. Notice that
+    the tags are namespaced, and this namespacing might be somewhat obfuscated, in the form of a xmlns
+    property in the containing 'Workbook' tag.
+
+    This class converts simple (not tested with XLSX sheets containing formulas etc) XML-formatted tables
+    to csv, regardless of the specific tagging and hierarchy structure.
+
+    Tested with some XLSX files and worked fine even for files that wouldn't convert in tools
+    such as dilshod's xlsx2csv.
+    """
+
+    def __init__(self, input_file, output_file, encoding='utf-8'):
+        """Initialize the class with the paths to the input xml file
+        and the output csv file
+
+        Keyword arguments:
+        input_file -- input xml filename
+        output_file -- output csv filename
+        encoding -- character encoding
+        """
+
+        self.output_buffer = []
+        self.output = None
+
+        # open the xml file for iteration
+        self.context = ETree.iterparse(input_file, events=("start", "end"))
+
+        # output file handle
+        try:
+            self.output = codecs.open(output_file, "w", encoding=encoding)
+        except:
+            print("Failed to open the output file")
+            raise
+
+    def convert(self, tag="Data", delimiter=",", noheader=False,
+                limit=-1, buffer_size=1000):
+
+        """Convert the XML table file to CSV file
+
+            Keyword arguments:
+            tag -- the record tag that contains a single entry's text. eg: Data (Microsoft XLSX)
+            delimiter -- csv field delimiter
+            limit -- maximum number of records to process
+            buffer -- number of records to keep in buffer before writing to disk
+
+            Returns:
+            number of records converted
+        """
+
+        items = []
+
+        depth = 0
+        min_depth = 0
+        row_depth = -1
+        n = 0
+
+        # iterate through the xml
+        for event, elem in self.context:
+            if event == "start":
+                depth += 1
+                continue
+            else:
+                depth -= 1
+                if depth < min_depth:
+                    min_depth = depth
+
+            if depth < row_depth and items:
+                if noheader:
+                    noheader = False
+                else:
+                    # new line
+                    self.output_buffer.append(items)
+                items = []
+                # flush buffer to disk
+                if len(self.output_buffer) > buffer_size:
+                    self._write_buffer(delimiter)
+
+            plain_tag = elem.tag
+            last_delim = max(elem.tag.rfind('}'), elem.tag.rfind(':'))
+            if 0 < last_delim < len(elem.tag) - 1:
+                plain_tag = elem.tag[last_delim + 1:]
+            if tag == plain_tag:
+                if n == 0:
+                    min_depth = depth
+                elif n == 1:
+                    row_depth = min_depth
+                n += 1
+                if 0 < limit < n:
+                    break
+                items.append(elem.text)
+
+        self._write_buffer(delimiter)  # write rest of the buffer to file
+
+        return n
+
+    def _write_buffer(self, delimiter):
+        """Write records from buffer to the output file"""
+
+        self.output.write('\n'.join([delimiter.join(e) for e in self.output_buffer]) + '\n')
+        self.output_buffer = []