# Parsing the Content
- https://docs.python-guide.org/scenarios/scrape/

In [None]:
import requests
from urllib.request import Request, urlopen
import json
from lxml import html
import unicodecsv as csv

In [None]:
# Download the Zillow search page as raw bytes.
# A browser-like User-Agent is sent with the request (presumably because the
# default urllib one gets rejected by the site -- confirm).
url = "https://www.zillow.com/homes/m5e_rb/"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# The context manager closes the HTTP response deterministically, and the
# timeout keeps the cell from hanging forever on a stalled connection.
with urlopen(req, timeout=30) as response:
    webpage = response.read()
# Preview only the beginning of the payload instead of dumping the whole page.
webpage[:500]

In [None]:
# Check the Python type of the downloaded payload before passing it to the parser.
type(webpage)

### `html.fromstring()` 
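- accepts either `bytes` or `str` as input; passing the raw `bytes` of a downloaded page lets lxml work out the document's encoding itself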

In [None]:
# IPython introspection: a trailing `?` displays the docstring of html.fromstring.
html.fromstring?

In [None]:
# Minimal example: parse a small HTML snippet given as a str and inspect
# the type of the object that comes back.
tree = html.fromstring("<p>Hello, World!</p>")
type(tree)

In [None]:
# Parse the downloaded page. The name notwithstanding, `parser` holds the parsed
# document tree (it is queried with .xpath() further down), not a parser object.
parser = html.fromstring(webpage)

The `parser` variable now holds the whole HTML document as a tree structure, which we can query in two different ways:
- XPath
- CSS selectors (via `cssselect`)

### `XPath`
`XPath` is a way of locating information in structured documents such as HTML or XML documents. 
- XPath stands for XML Path Language
- XPath uses "path like" syntax to identify and navigate nodes in an XML document

In [None]:
# Fetch the example page used for the XPath demonstration.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of silently parsing an
# error page.
page = requests.get('http://econpy.pythonanywhere.com/ex/001.html', timeout=30)
page.raise_for_status()
# Hand the raw response bytes (page.content) to the parser.
tree = html.fromstring(page.content)

[Example page](http://econpy.pythonanywhere.com/ex/001.html)

After a quick analysis, we see that the data on this page is contained in two kinds of elements: a `div` with title `buyer-name` and a `span` with class `item-price`:

In [None]:
# Buyer names live in <div title="buyer-name"> elements. "item-price" is the
# class of the price <span>, so using it as a div title does not select buyers.
buyers = tree.xpath('//div[@title="buyer-name"]/text()')
buyers

In [None]:
# Back to parsing the Zillow page held in `parser`:
# - search_results: <article> elements found under the div with id "search-results"
#   (this name is reassigned further down from the decoded JSON payload).
# - raw_json_data: text nodes of the <script> element carrying
#   data-zrr-shared-data-key="mobileSearchPageStore".
search_results = parser.xpath("//div[@id='search-results']//article")
raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')

# The script text still carries a lot of noise around the actual listing data:
raw_json_data

In [None]:
def clean(text):
    """Join the pieces of ``text`` with single spaces and collapse whitespace runs.

    Returns None when ``text`` is empty or otherwise falsy.
    """
    if not text:
        return None
    joined = ' '.join(text)
    # split() with no argument discards leading/trailing whitespace and treats
    # any run of whitespace as one separator.
    return ' '.join(joined.split())

# clean() returns None when the XPath query produced no text (for example if the
# expected <script> element is absent from the page). Fail with a clear message
# instead of an AttributeError raised by calling .replace() on None.
joined_script_text = clean(raw_json_data)
if joined_script_text is None:
    raise ValueError("No 'mobileSearchPageStore' script text found in the page; nothing to parse.")
# Strip the HTML comment markers so only the payload inside them remains.
cleaned_data = joined_script_text.replace('<!--', "").replace("-->", "")
cleaned_data

In [None]:
# Decode the cleaned script text as JSON and display the resulting object.
json_data = json.loads(cleaned_data) # parse cleaned_data into Python objects
json_data

In [None]:
# List of dictionaries, one per listing, taken from the decoded payload.
# Each intermediate level falls back to an empty dict, so a missing (or null)
# 'cat1' / 'searchResults' entry yields an empty list instead of an
# AttributeError raised by calling .get() on None.
search_results = (
    ((json_data.get('cat1') or {}).get('searchResults') or {}).get('listResults', [])
)
search_results

In [None]:
# Peek at the first listing. When the search returned no listings, evaluate to
# None (no output) instead of raising IndexError.
search_results[0] if search_results else None

In [None]:
# Keep only the needed fields: one {'title', 'url'} record per listing.
all_data = [
    {'title': properties.get('statusText'), 'url': properties.get('detailUrl')}
    for properties in search_results
]

all_data