# DS-SF-36 | 04 | Databases and Scraping | Codealong | Starter Code

## Part C | Scraping and Amazon Product Reviews (cont.)

> ## We are now ready to extract the reviews offline and no longer need to query the Amazon website.

In [5]:
import gzip
import json
import os

import dateutil
import lxml.html
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

> ## Input

In [6]:
# Location of the gzipped JSON dump of the review pages, relative to this notebook.
reviews_path = os.path.join('..', 'datasets', 'dataset-04-reviews.json.gz')

# json.load reads the whole stream and parses it, exactly like json.loads(f.read()).
with gzip.open(reviews_path, 'rb') as f:
    pages = json.load(f)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [3]:
len(pages)

NameError: name 'pages' is not defined

## First page

In [4]:
# `pages` is keyed by page number as a string; each entry's 'content' is the markup parsed with lxml below.
page = pages['1']['content']

NameError: name 'pages' is not defined

In [None]:
page

In [None]:
document = lxml.html.fromstring(page)

In [None]:
type(document)

(http://lxml.de/api/lxml.html-module.html#fromstring and http://lxml.de/api/lxml.html.HtmlElement-class.html)

> ## All reviews of a page

(http://lxml.de/api/lxml.etree._Element-class.html#xpath)

In [7]:
reviews = document.xpath('//*[@data-hook="review"]')

NameError: name 'document' is not defined

## First review

In [8]:
review = reviews[0]

NameError: name 'reviews' is not defined

In [9]:
type(review)

NameError: name 'review' is not defined

> ## id

(http://lxml.de/api/lxml.etree._Element-class.html#get)

In [None]:
review.get('id')

> ## star rating

In [None]:
# Read the `class` attribute of the star-rating element; `star_rating` (below) recovers the rating from an `a-star-N` class.
review.find('.//*[@data-hook="review-star-rating"]').get('class')

(http://lxml.de/api/lxml.etree._Element-class.html#find)

(https://en.wikipedia.org/wiki/XPath)

> ## title

In [None]:
review.findtext('.//*[@data-hook="review-title"]')

(http://lxml.de/api/lxml.etree._Element-class.html#findtext)

> ## author

In [None]:
# The path matches a `review-author` element that is a direct child of another `review-author` element.
review.findtext('.//*[@data-hook="review-author"]/*[@data-hook="review-author"]')

> ## date

In [None]:
# TODO

> ## body

In [None]:
# TODO

> ## Output

In [15]:
# Empty frame that fixes the output schema: one column per extracted review field.
REVIEW_COLUMNS = ['date', 'id', 'author', 'title', 'body', 'star_rating']
df = pd.DataFrame(columns=REVIEW_COLUMNS)

In [16]:
df

Unnamed: 0,date,id,author,title,body,star_rating


## Putting all of it together

(https://docs.python.org/3/howto/unicode.html and https://docs.python.org/3/library/stdtypes.html)

In [17]:
def date(node):
    """Return the date of the review contained in `node`.

    Reads the text of the element tagged ``data-hook="review-date"`` inside
    `node`, removes the "on " prefix text and parses what is left with
    dateutil.

    Parameters
    ----------
    node : element exposing ``findtext(path)`` (e.g. an lxml element)

    Returns
    -------
    datetime.datetime, or None when `node` has no review-date element.
    """
    # Imported here so the helper is self-contained: a bare `import dateutil`
    # does not load the `parser` submodule.
    from dateutil.parser import parse as parse_date

    # Query the argument, not the notebook-global `review`; otherwise every
    # call returns the date of whichever review that global last pointed at.
    text = node.findtext('.//*[@data-hook="review-date"]')
    if text is None:
        return None

    return parse_date(text.replace('on ', ''))

def id(node):
    """Return the value of the ``id`` attribute of `node`.

    The result is whatever ``node.get('id')`` yields. Note that this helper
    shadows the builtin `id` for the rest of the notebook.
    """
    review_id = node.get('id')
    return review_id

def author(node):
    """Return the review author's name as an ASCII-only `str`.

    Looks up the ``review-author`` element nested directly inside another
    ``review-author`` element under `node` and drops any non-ASCII
    characters from its text.

    Returns None when `node` has no such element.
    """
    text = node.findtext('.//*[@data-hook="review-author"]/*[@data-hook="review-author"]')
    if text is None:
        return None

    # encode() alone yields `bytes` on Python 3, which would be written to the
    # CSV as "b'...'"; decode back so callers always get text.
    return text.encode('ascii', 'ignore').decode('ascii')

def title(node):
    """Return the review title as an ASCII-only `str`.

    Reads the text of the element tagged ``data-hook="review-title"`` under
    `node` and drops any non-ASCII characters.

    Returns None when `node` has no such element.
    """
    text = node.findtext('.//*[@data-hook="review-title"]')
    if text is None:
        return None

    # encode() alone yields `bytes` on Python 3, which would be written to the
    # CSV as "b'...'"; decode back so callers always get text.
    return text.encode('ascii', 'ignore').decode('ascii')

def body(node):
    """Return the review body as an ASCII-only `str`.

    Reads the text of the element tagged ``data-hook="review-body"`` under
    `node` and drops any non-ASCII characters.

    Returns None when `node` has no such element.
    """
    # NOTE(review): findtext() yields only the text that precedes the
    # element's first child tag; if review bodies contain markup (e.g. <br>),
    # the rest is dropped -- confirm whether the full text content is wanted.
    text = node.findtext('.//*[@data-hook="review-body"]')
    if text is None:
        return None

    # encode() alone yields `bytes` on Python 3, which would be written to the
    # CSV as "b'...'"; decode back so callers always get text.
    return text.encode('ascii', 'ignore').decode('ascii')

def star_rating(node):
    """Return the review's star rating as an int from 1 to 5, or NaN.

    Finds the element tagged ``data-hook="review-star-rating"`` under `node`
    and returns the first N in 1..5 for which ``find_class('a-star-N')`` on
    that element reports a match.

    Returns ``np.nan`` when the element is missing or carries no
    ``a-star-N`` class.
    """
    rating_node = node.find('.//*[@data-hook="review-star-rating"]')

    # Identity test: `== None` would go through the element's own `__eq__`.
    if rating_node is None:
        return np.nan

    # A distinct loop name keeps the function's own name from being shadowed.
    for stars in range(1, 6):
        if rating_node.find_class('a-star-{:d}'.format(stars)):
            return stars

    return np.nan

(http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html)

In [18]:
# Parse every stored page in numeric page order and collect one record per
# review. Records go into a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0. Building from scratch also makes re-running this cell safe.
records = []

for i in sorted(pages.keys(), key=int):
    page = pages[i]['content']
    document = lxml.html.fromstring(page)
    reviews = document.xpath('//*[@data-hook="review"]')

    for review in reviews:
        records.append({'date': date(review),
                        'id': id(review),
                        'author': author(review),
                        'title': title(review),
                        'body': body(review),
                        'star_rating': star_rating(review)})

# Reuse the column order declared on the (empty) output frame.
df = pd.DataFrame(records, columns=df.columns)

NameError: name 'pages' is not defined

In [19]:
df

Unnamed: 0,date,id,author,title,body,star_rating


In [20]:
df.shape

(0, 6)

In [21]:
# Write the extracted reviews next to the raw dataset; the row index is not saved.
output_path = os.path.join('..', 'datasets', 'dataset-04-reviews.csv')
df.to_csv(output_path, index=False)