<a href="https://colab.research.google.com/github/kulkarohan/tally-ai/blob/master/scraper_no_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import json
import psycopg2 as ps
import requests

from concurrent.futures import ThreadPoolExecutor as Executor
from lxml import html
from pandas import to_datetime
from requests import Session

In [0]:
def ValuePredictor(bid, from_isbn=False):
    """
    Scrape review dates, review texts, and star ratings for a Yelp business.

    Requests 10 pages of the business's ``review_feed`` endpoint (offsets
    0, 20, ..., 180, requested with ``sort_by=date_desc``) concurrently and
    combines the rows in page order.

    Args:
        bid: Yelp business id; it is inserted into the request URL.
        from_isbn: Unused. Kept only so existing callers keep working.

    Returns:
        A list of ``[date, review, rating]`` lists. The values are the raw
        scraped results (dates and ratings come straight from XPath
        queries) and have not been cleaned or converted.
    """
    base_url = "https://www.yelp.com/biz/"
    api_url = "/review_feed?sort_by=date_desc&start="
    page_size = 20   # step of the `start=` offset between consecutive pages
    page_count = 10  # number of pages requested per business

    class Scraper():
        def __init__(self):
            # Filled by scrape() with the rows of every page, in page order
            self.data = []

        def get_data(self, n, bid=bid):
            """Fetch page ``n`` and return its rows as [date, review, rating] lists."""
            with Session() as s:
                # Makes an http get request to the page url; the body is json
                with s.get(base_url + bid + api_url + str(n * page_size)) as resp:
                    payload = resp.json()

            # The 'review_list' entry holds the html fragment with the reviews
            _html = html.fromstring(payload['review_list'])

            # 3 categories of data scraped from yelp business sites
            dates = _html.xpath("//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()")
            reviews = [el.text for el in _html.xpath("//div[@class='review-content']/p")]
            ratings = _html.xpath("//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title")

            # Transpose the three lists into one row per review. zip stops at
            # the shortest list, so a page whose three queries return
            # different counts yields fewer rows.
            return [list(row) for row in zip(dates, reviews, ratings)]

        def scrape(self):
            # Pages are fetched in parallel, but Executor.map yields results in
            # the order of its inputs, so rows are merged in page order rather
            # than in whatever order the worker threads happen to finish.
            with Executor(max_workers=40) as e:
                for rows in e.map(self.get_data, range(page_count)):
                    self.data.extend(rows)

    s = Scraper()
    s.scrape()
    return s.data

In [0]:
# Yelp business id to scrape. Hard-coded for now; it will be received from
# the Django endpoint once that is implemented.
BUSINESS_ID = 'oqbhVgliVJH-iRa3AnD-3A'

In [0]:
# Scrape the reviews for BUSINESS_ID and keep the raw rows
# (each row holds a review date, the review text, and the star rating)
raw_output = ValuePredictor(BUSINESS_ID)

In [0]:
def str_conversion(scraper_output):
  """
  Return a copy of the scraped rows with every field converted to ``str``.

  The scraper's values are of type 'lxml.etree._ElementUnicodeResult', which
  gets in the way of modifying the date and star_rating fields later on.

  Side effect: ``scraper_output`` is emptied in place once the converted copy
  has been built, to release the original rows.
  """
  stringified = [[str(field) for field in record] for record in scraper_output]

  # Drop the original rows to clear memory
  del scraper_output[:]

  return stringified

In [0]:
# Convert every scraped field to str; the result is passed to wrangle_data.
# Note: str_conversion also empties raw_output in place.
str_output = str_conversion(raw_output)

In [0]:
def wrangle_data(str_data):
  """
  Clean the date and star rating of every row before storing in Postgres.

  Each row is modified in place: element 0 (the date text) has its newlines
  and spaces removed and is parsed with ``to_datetime``; element 2 (the
  rating text) is cut at its first space and converted to ``float``.
  The same ``str_data`` object is returned.
  """
  for record in str_data:
    # Strip newlines and spaces from the date text, then parse it
    date_text = record[0].replace('\n', '')
    date_text = date_text.replace(' ', '')
    record[0] = to_datetime(date_text)

    # Keep only the leading number of the rating text
    stars_text = record[2].split(' ')[0]
    record[2] = float(stars_text)

  return str_data

In [0]:
# Clean the dates and star ratings; the result is passed to add_biz_id.
# wrangle_data edits the rows in place, so clean_output is the same list as str_output.
clean_output = wrangle_data(str_output)

In [0]:
# Inspect the first cleaned row
clean_output[0]

[Timestamp('2018-11-16 00:00:00'),
 "This is my favorite coffeehouse in Philadelphia. Purely for the coffee. It's down an alleyway so don't get spooked! But the inside is so vintage and cute. There's an earthy vibe with a relaxed playlist. There's never much of a wait- the service is personable. I've never been able to sit an enjoy my coffee here- usually the place is packed. I guess that would be my own recommendation to change is more seating. If you're looking to sit down and work in a low key environment- this is a little more bustle. But the place is a must visit to at least try the coffee!",
 5.0]

In [0]:
def add_biz_id(data, biz_id):
  """
  Tag every row of ``data`` with the business ID it belongs to.

  The business ID (retrieved from the Django endpoint) is added as the last
  element of each row. Rows are modified in place and the same ``data``
  object is returned.
  """
  for record in data:
    # One-element tuple so the ID is added as a single item
    record.extend((biz_id,))

  return data

In [0]:
# Append the business ID to every cleaned row (rows are modified in place)
final_output = add_biz_id(clean_output, BUSINESS_ID)

In [0]:
# Display the final rows
final_output

[[Timestamp('2018-11-16 00:00:00'),
  "This is my favorite coffeehouse in Philadelphia. Purely for the coffee. It's down an alleyway so don't get spooked! But the inside is so vintage and cute. There's an earthy vibe with a relaxed playlist. There's never much of a wait- the service is personable. I've never been able to sit an enjoy my coffee here- usually the place is packed. I guess that would be my own recommendation to change is more seating. If you're looking to sit down and work in a low key environment- this is a little more bustle. But the place is a must visit to at least try the coffee!",
  5.0,
  'oqbhVgliVJH-iRa3AnD-3A'],
 [Timestamp('2018-11-16 00:00:00'),
  'Fancy coffee. The atmosphere here is really earthy and open. Plenty of outlets and a lot of turn over with seating. Picnic tables and loungey chairs. They also have hand poured coffee with exotic beans that are around $5 a cup but pretty tasty. All the staff are really friendly too. Interesting spot and love the conc