# Parse and Encoding
In this notebook we will learn how to parse text in a specific format and how to encode data into a specific format.

### CSV

#### Read CSV from string
Due to the limitations of Jupyter Notebook, we use a multi-line string as input.

In [53]:
import csv

# Sample CSV text. The triple-quoted literal starts and ends with a blank line,
# and most records carry trailing spaces after the last field.
s = '''
Symbol,Price,Date,Time,Change,Volume    
"AA",39.48,"6/11/2007","9:36am",-0.18,181800    
"AIG",71.38,"6/11/2007","9:36am",-0.15,195500    
"AXP",62.58,"6/11/2007","9:36am",-0.46,935000    
"BA",98.31,"6/11/2007","9:36am",+0.12,104800    
"C",53.08,"6/11/2007","9:36am",-0.25,360900    
"CAT",78.29,"6/11/2007","9:36am",-0.23,225400
'''

# One converter per column, in the same order as the header.
col_types = [str, float, str, str, float, int]

# Drop blank lines before they reach the reader.
non_blank_lines = (line for line in s.splitlines() if line.strip())
f_csv = csv.reader(non_blank_lines, delimiter=',')

# The first record is the header; `headers` stays None when there is no data.
header_row = next(f_csv, None)
headers = None if header_row is None else [column.strip() for column in header_row]

# Strip every field before converting it, so surrounding padding can never
# leak into a str column (int/float would tolerate it, str would not).
rows = [
    tuple(convert(value.strip()) for convert, value in zip(col_types, row))
    for row in f_csv
]
print("headers =", headers)
print("rows =", rows)

headers = ['Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume']
rows = [('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800), ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500), ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000), ('BA', 98.31, '6/11/2007', '9:36am', 0.12, 104800), ('C', 53.08, '6/11/2007', '9:36am', -0.25, 360900), ('CAT', 78.29, '6/11/2007', '9:36am', -0.23, 225400)]


#### Observation
1. A CSV line can be empty, so we need to filter out empty lines.
2. We can customize the delimiter.
3. By default, every column is read as a string, and values may have leading or trailing spaces, so you need to trim them with `strip()`.
4. You can convert the data using a predefined list of column types.
5. The header is a list of column names; the rows can be stored as a list of tuples (for performance).

#### Write CSV data into String

In [44]:
import csv
import io

# Column names, followed by the records to serialize (one tuple per CSV line).
headers = ['Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume']
rows = [
    ('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800),
    ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500),
    ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000),
    ('BA', 98.31, '6/11/2007', '9:36am', 0.12, 104800),
    ('C', 53.08, '6/11/2007', '9:36am', -0.25, 360900),
    ('CAT', 78.29, '6/11/2007', '9:36am', -0.23, 225400),
]

# Write into an in-memory text buffer. QUOTE_NONNUMERIC quotes every
# non-numeric field and leaves numbers bare.
output = io.StringIO()
writer = csv.writer(output, quoting=csv.QUOTE_NONNUMERIC)
writer.writerows([headers, *rows])  # header line first, then every record
print(output.getvalue())

"Symbol","Price","Date","Time","Change","Volume"
"AA",39.48,"6/11/2007","9:36am",-0.18,181800
"AIG",71.38,"6/11/2007","9:36am",-0.15,195500
"AXP",62.58,"6/11/2007","9:36am",-0.46,935000
"BA",98.31,"6/11/2007","9:36am",0.12,104800
"C",53.08,"6/11/2007","9:36am",-0.25,360900
"CAT",78.29,"6/11/2007","9:36am",-0.23,225400



### Json

#### Load a JSON string

In [56]:
import json
from collections import OrderedDict

s = '{"name": "ACME", "shares":50, "price":490.1}'

# The decoder hands each JSON object's key/value pairs, in document order,
# to OrderedDict, so the decoded result is an OrderedDict rather than a dict.
ordered_decoder = json.JSONDecoder(object_pairs_hook=OrderedDict)
data = ordered_decoder.decode(s)
print(data)

OrderedDict([('name', 'ACME'), ('shares', 50), ('price', 490.1)])

#### Dump JSON data

In [60]:
import json

# Plain dict of JSON-compatible values to serialize.
data = {
    'name': 'ACME',
    'shares': 100,
    'price': 542.23,
}

# json.dumps returns the encoded document as a str. Printing the type too
# makes that visible and reproduces the output recorded for this cell.
json_str = json.dumps(data)
print('json_str = ', json_str, type(json_str))

json_str =  {"name": "ACME", "shares": 100, "price": 542.23} <class 'str'>


### XML

#### Parse XML

In [79]:
import xml.etree.ElementTree as ET

# XML string
xml_string = '''
    <?xml version="1.0"?>    
    <rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">    
        <channel>      
            <title>Planet Python</title>      
            <link>http://planet.python.org/</link>      
            <language>en</language>      
            <description>Planet Python - http://planet.python.org/</description>      
            <item>        
                <title>Steve Holden: Python for Data Analysis</title>          
                <guid>http://holdenweb.blogspot.com/...-data-analysis.html</guid>          
                <link>http://holdenweb.blogspot.com/...-data-analysis.html</link>          
                <description>...</description>          
                <pubDate>Mon, 19 Nov 2012 02:13:51 +0000</pubDate>      
            </item>      
            <item>
                <title>Vasudev Ram: The Python Data model (for v2 and v3)</title>        
                <guid>http://jugad2.blogspot.com/...-data-model.html</guid>        
                <link>http://jugad2.blogspot.com/...-data-model.html</link>        
                <description>...</description>        
                <pubDate>Sun, 18 Nov 2012 22:06:47 +0000</pubDate>        
            </item>      
            <item>        
                <title>Python Diary: Been playing around with Object Databases</title>        
                <guid>http://www.pythondiary.com/...-object-databases.html</guid>        
                <link>http://www.pythondiary.com/...-object-databases.html</link>        
                <description>...</description>        
                <pubDate>Sun, 18 Nov 2012 20:40:29 +0000</pubDate>      
            </item>
        </channel>    
    </rss>
'''

# Parse the XML string.
# An XML declaration is only accepted at the very start of the document, and
# the triple-quoted literal begins with a newline plus indentation. Trimming
# the surrounding whitespace lets the parser consume the declaration itself,
# rather than cutting the text at the first '?>' found anywhere in it.
xml_content = xml_string.strip()
root = ET.fromstring(xml_content)

# Collect the title, publication date and link of every <item> under <channel>.
rows = [
    {
        'title': item.findtext('title'),
        'date': item.findtext('pubDate'),
        'link': item.findtext('link'),
    }
    for item in root.iterfind('channel/item')
]
print(rows)

[{'title': 'Steve Holden: Python for Data Analysis', 'date': 'Mon, 19 Nov 2012 02:13:51 +0000', 'link': 'http://holdenweb.blogspot.com/...-data-analysis.html'}, {'title': 'Vasudev Ram: The Python Data model (for v2 and v3)', 'date': 'Sun, 18 Nov 2012 22:06:47 +0000', 'link': 'http://jugad2.blogspot.com/...-data-model.html'}, {'title': 'Python Diary: Been playing around with Object Databases', 'date': 'Sun, 18 Nov 2012 20:40:29 +0000', 'link': 'http://www.pythondiary.com/...-object-databases.html'}]


#### Turning Dictionary into XML

In [99]:
from xml.etree.ElementTree import Element, tostring
def dict_to_xml(tag: str, d: dict) -> Element:
    '''
    Build an XML element named `tag` from a flat dict.

    Each key/value pair of `d` becomes one child element, in the dict's
    iteration order: the key is used as the child's tag and `str(value)`
    as its text. The new parent element is returned.
    '''
    def make_child(name, value):
        # One leaf element per dict entry; the value is always stored as text.
        node = Element(name)
        node.text = str(value)
        return node

    parent = Element(tag)
    for name, value in d.items():
        parent.append(make_child(name, value))
    return parent

# Convert a flat stock record to an element, serialize it to UTF-8 bytes,
# and print the decoded markup.
s = dict(name='GOOG', shares=100, prices=490.1)
e = dict_to_xml('stock', s)
xml_string = tostring(e, encoding='utf-8', method='xml')
print(str(xml_string, 'utf-8'))


<stock><name>GOOG</name><shares>100</shares><prices>490.1</prices></stock>


#### Modifying XML

In [103]:
# `tostring` is called at the end of this cell, so it is imported here too
# instead of relying on an import made by an earlier cell.
from xml.etree.ElementTree import parse, Element, fromstring, tostring
xml_string = '''
<?xml version="1.0"?>
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <sri>
        <rt>22</rt>
        <d>North Bound</d>
        <dd>North Bound</dd>
    </sri>
    <cr>22</cr>
    <pre>
        <pt>5 MIN</pt>
        <fd>Howard</fd>
        <v>1378</v>
        <rn>22</rn>
    </pre>
    <pre>
        <pt>15 MIN</pt>
        <fd>Howard</fd>
        <v>1867</v>
        <rn>22</rn>
    </pre>
</stop>
'''
# The literal starts with a newline, so the XML declaration is not at the very
# start of the text. Keep only what follows the declaration's closing '?>'
# (or the whole string when no '?>' is present) before parsing.
start_index = xml_string.find('?>')
xml_content = xml_string[start_index + 2 if start_index >= 0 else 0:]
print('xml_content =', xml_content)
root = fromstring(xml_content)

# Remove the <sri> and <cr> children of the root element.
root.remove(root.find('sri'))
root.remove(root.find('cr'))

# Insert a new <spam> element at child index 2, i.e. right after <nm>.
e = Element('spam')
e.text = 'This is a test'
root.insert(2, e)

# Serialize the modified tree; tostring returns bytes for encoding='utf-8'.
xml_string = tostring(root, encoding='utf-8', method='xml')
print('xml_string = ', xml_string.decode('utf-8'))



xml_content = 
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <sri>
        <rt>22</rt>
        <d>North Bound</d>
        <dd>North Bound</dd>
    </sri>
    <cr>22</cr>
    <pre>
        <pt>5 MIN</pt>
        <fd>Howard</fd>
        <v>1378</v>
        <rn>22</rn>
    </pre>
    <pre>
        <pt>15 MIN</pt>
        <fd>Howard</fd>
        <v>1867</v>
        <rn>22</rn>
    </pre>
</stop>

xml_string =  <stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <spam>This is a test</spam><pre>
        <pt>5 MIN</pt>
        <fd>Howard</fd>
        <v>1378</v>
        <rn>22</rn>
    </pre>
    <pre>
        <pt>15 MIN</pt>
        <fd>Howard</fd>
        <v>1867</v>
        <rn>22</rn>
    </pre>
</stop>
