In [21]:
from __future__ import print_function

import time
import requests
from bs4 import BeautifulSoup
import pymongo
import pandas as pd
import numpy as np
import re
import unicodedata

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wnlzer = WordNetLemmatizer()

from pprint import pprint
from time import time
import logging

import matplotlib.pyplot as plt

In [53]:
def list_o_strains(i):
    """Collect the strain links found on a listing page.

    Parameters
    ----------
    i : str
        URL of the page to download with an HTTP GET.

    Returns
    -------
    list of str
        The ``href`` of every ``<a class="ga_Explore_Strain_Tile">`` element
        on the page, with its first character removed (presumably the
        leading "/" of a site-relative link).
    """
    response = requests.get(i)
    page = BeautifulSoup(response.content, 'html.parser')
    tiles = page.find_all('a', class_="ga_Explore_Strain_Tile")
    # str(...)[1:] drops the first character of each href.
    return [str(tile.attrs['href'])[1:] for tile in tiles]

In [23]:
def get_reviews2(l):
    """Download a single page and return its raw body.

    Parameters
    ----------
    l : str
        URL to fetch with an HTTP GET.

    Returns
    -------
    The ``content`` attribute of the response, i.e. the undecoded body.
    The HTTP status is not inspected, so an error page is returned like any
    other body.
    """
    return requests.get(l).content

In [32]:
def star_int_conv(s):
    """Turn an inline ``style`` string into a star rating.

    Parameters
    ----------
    s : str
        Style text such as ``"width:110px;"``. The first six characters are
        discarded (presumably the ``width:`` prefix -- confirm against the
        page markup).

    Returns
    -------
    The pixel count divided by 22 (presumably 22 px per star). The division
    uses ``/``, so the result is a float under Python 3.
    """
    # Keep only the text before the first ';' once the prefix is removed.
    first_declaration = s[6:].split(';')[0]
    # strip('px') removes any leading/trailing 'p' or 'x' characters.
    pixels = int(first_declaration.strip('px'))
    return pixels / 22

In [62]:
def parse_docs2(d):
    """Extract strain details and user reviews from one strain page.

    Parameters
    ----------
    d : str or bytes
        HTML markup of the page; it is handed straight to BeautifulSoup.

    Returns
    -------
    tuple of eight lists, in this order:
        strain_description : text of every ``<div itemprop="description">``,
            with newlines replaced by spaces and NFKD-normalised.
        strain_flavors : text of every ``div.flavor-name`` minus its first
            three characters.
        effects : the first five ``div.histogram-label`` texts.
        meds : histogram labels 5 through 9.
        negs : histogram labels from index 10 onwards.
        user_name : ``<h2>`` text of each ``div.strain-review__title`` that
            is not in the hard-coded exclusion list.
        star_rate : ``star_int_conv`` of each matched star element's style.
        strain_text : each review paragraph reduced to lower-case letters,
            with English stop words removed and tokens lemmatised.
    """
    soup = BeautifulSoup(d, 'html.parser')

    # Build the stop-word set once; it does not change between tokens, so
    # rebuilding it inside the comprehension was wasted work.
    stop_words = set(stopwords.words('english'))

    # Review text: keep letters and spaces only, lower-case, drop stop words,
    # lemmatise, and re-join with single spaces.
    strain_text = []
    for review in soup.find_all('p', class_='strain-review__text'):
        letters_only = re.sub('[^A-Za-z ]', "", review.text)
        tokens = letters_only.lower().split()
        kept = [wnlzer.lemmatize(tok) for tok in tokens if tok not in stop_words]
        strain_text.append(" ".join(kept))

    # Star ratings, read from each matched element's inline style.
    # NOTE(review): "div.div.stars" matches a <div> carrying BOTH classes
    # "div" and "stars". If the intent was a "stars" div nested in another
    # div, the selector would be "div div.stars" -- confirm against the page.
    star_rate = [star_int_conv(tag.attrs['style'])
                 for tag in soup.select("div.div.stars")]

    # Reviewer names, skipping a hard-coded list of names.
    # NOTE(review): names are filtered here but star_rate / strain_text are
    # not, so the three lists may not line up index-for-index -- verify.
    no_add = ['damsmith', 'Suit', 'BluPrizm']
    user_name = []
    for title in soup.find_all('div', class_='strain-review__title'):
        heading = title.find('h2')
        if heading is None:
            # A title block without an <h2> has no name to record.
            continue
        if heading.text not in no_add:
            user_name.append(heading.text)

    # Strain descriptions: flatten newlines, then NFKD-normalise.
    strain_description = []
    for block in soup.find_all('div', itemprop='description'):
        flattened = re.sub('\n', " ", block.text)
        strain_description.append(unicodedata.normalize("NFKD", flattened))

    # All histogram labels, then split by position.
    # Presumably the page lists five effects, five medical uses, then the
    # negatives, in that order -- TODO confirm.
    strain_effects = [label.text
                      for label in soup.find_all('div', attrs={'class': 'histogram-label'})]
    effects = strain_effects[:5]
    meds = strain_effects[5:10]
    negs = strain_effects[10:]

    # Flavor names; the first three characters of each are dropped
    # (presumably a numbering/whitespace prefix -- confirm).
    strain_flavors = [flavor.text[3:]
                      for flavor in soup.find_all('div', class_='flavor-name')]

    return strain_description, strain_flavors, effects, meds, negs, user_name, star_rate, strain_text

In [40]:
def strain_dict_entry(strain, stype, desc, flavors, effects, meds, negs, user_id, userstars, userreview, straindict):
    """Record one review entry for a strain, creating the strain if needed.

    Parameters
    ----------
    strain : key under which the strain is stored.
    stype, desc, flavors, effects, meds, negs :
        Stored on the strain's record only when the strain is first added;
        ignored if the strain already exists.
    user_id, userstars, userreview :
        Stored together as one ``{'user', 'stars', 'review'}`` entry that is
        appended to the strain's ``user_reviews`` list.
    straindict : dict or None
        Dictionary to update in place; ``None`` starts a new one.

    Returns
    -------
    dict
        The updated dictionary (the same object that was passed in, unless
        ``None`` was given).
    """
    registry = {} if straindict is None else straindict

    # setdefault stores the fresh record only when the strain is new;
    # otherwise the existing record is returned untouched.
    record = registry.setdefault(strain, {
        "stype": stype,
        "general_description": desc,
        "flavor": flavors,
        'effects': effects,
        'medical_uses': meds,
        'negatives': negs,
        "user_reviews": [],
    })

    review_entry = {'user': user_id, 'stars': userstars, 'review': userreview}
    record['user_reviews'].append(review_entry)
    return registry

In [41]:
def scraper_dummy1(los):
    """Scrape a list of strain pages from Leafly into one dictionary.

    Parameters
    ----------
    los : iterable of str
        Site-relative strain paths of the form ``"<type>/<name>"``,
        e.g. ``"hybrid/blue-dream"``.

    Returns
    -------
    dict or None
        Dictionary keyed by strain name, built by ``strain_dict_entry``.
        ``None`` when nothing was stored (``los`` is empty, or the first
        page yielded no description).

    Notes
    -----
    Each strain triggers one HTTP request. The index is printed for every
    tenth strain processed as a progress indicator.
    """
    strain_d = None
    for cnt, s in enumerate(los):
        # Split on the slash instead of fixed offsets, so the type prefix
        # does not have to be exactly six characters long.
        stype, _, st_strain = s.partition('/')

        url = "https://www.leafly.com/{}".format(s)
        d = get_reviews2(url)
        desc, flavors, effects, meds, negs, users, star_ratings, reviews = parse_docs2(d)

        if len(desc) == 0:
            # NOTE(review): a page without a description stops the whole
            # run, not just this strain -- confirm that is intended.
            break

        strain_d = strain_dict_entry(st_strain, stype, desc, flavors, effects, meds,
                                     negs, users, star_ratings, reviews, strain_d)
        if cnt % 10 == 0:
            print(cnt)

    return strain_d

In [54]:
# Fetch the strain links from Leafly's explore page (performs a live HTTP request).
los = list_o_strains('https://www.leafly.com/explore')

In [58]:
# First six characters of entry 7; the value is discarded because this is
# not the cell's last expression.
los[7][:6]
# Skip the first eight links.
# NOTE(review): presumably these are not strain pages -- confirm.
l = los[8:]
# Drop this one strain; list.remove raises ValueError if it is not in the list.
l.remove("hybrid/skywalker-og")
# Last expression of the cell, so the list itself is what gets displayed.
# NOTE(review): the recorded output below (47) is not a list, so the cell
# appears to have been edited after it was last run.
l

47

In [59]:
# Take the first three strains as a small sample for trying out the scraper.
test_los = l[:3]

In [60]:
# Display the sample to confirm which strains will be scraped.
test_los

['hybrid/blue-dream', 'sativa/sour-diesel', 'hybrid/gsc']

In [61]:
# Scrape the sample strains; the returned dictionary is shown as the cell output.
scraper_dummy1(test_los)

0


{'blue-dream': {'stype': 'hybrid',
  'general_description': ['Blue Dream, a sativa-dominant hybrid originating in California, has achieved legendary status among West Coast strains. Crossing a Blueberry indica with the sativa Haze, Blue Dream balances full-body relaxation with gentle cerebral invigoration. Novice and veteran consumers alike enjoy the level effects of Blue Dream, which ease you gently into a calm euphoria. Some Blue Dream phenotypes express a more indica-like look and feel, but the sativa-leaning variety remains most prevalent. With a sweet berry aroma redolent of its Blueberry parent, Blue Dream delivers swift symptom relief without heavy sedative effects. This makes Blue Dream a popular daytime medicine for patients treating pain, depression, nausea, and other ailments requiring a high THC strain.  '],
  'flavor': ['Blueberry', 'Berry', 'Sweet'],
  'effects': ['Happy', 'Relaxed', 'Euphoric', 'Uplifted', 'Creative'],
  'medical_uses': ['Stress', 'Depression', 'Pain', '