In [1]:
# This notebook sets up the utilities used to scrape images of climats from vivino.

from explicit import waiter, XPATH
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as expect
from selenium.webdriver.common.keys import Keys

import requests
import time 
import re
import os
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()

In [14]:
def get_url_from_selenium(wines_list, url_list):
    """
    Browse vivino with Selenium and collect one search-results URL per wine.

    For every name, the vivino home page is loaded, the name is typed into
    the search box and submitted, and the browser's current URL is appended
    to ``url_list``.

    args:
    - wines_list : a list of climats that will be searched on the website.
    - url_list : a list that is extended in place with one URL per climat,
      in the same order as ``wines_list``.

    returns nothing (the result is delivered through ``url_list``).

    Any exception raised while browsing propagates to the caller, but the
    Chrome driver is always shut down first so no browser process is left
    behind. URLs collected before the failure remain in ``url_list``.
    """
    total = len(wines_list)

    # driver : chrome
    driver = webdriver.Chrome()
    try:
        for i, wine in enumerate(wines_list):
            driver.get("https://www.vivino.com/")  # go on the website
            # find_element(By.XPATH, ...) is used instead of
            # find_element_by_xpath, which no longer exists in Selenium >= 4.3.
            # The XPath matches the French placeholder text of the search box.
            elem = driver.find_element(By.XPATH, "//input[@placeholder='Chercher un vin']")
            elem.send_keys(wine)  # search for the wine
            elem.send_keys(Keys.RETURN)  # press enter
            # NOTE(review): the URL is read right after submitting, without an
            # explicit wait; if navigation is slow this may still be the home
            # page URL - confirm on the target machine.
            url_list.append(driver.current_url)
            time.sleep(1)  # wait for 1 second
            if i % 10 == 0:
                time.sleep(10)  # pause for 10 seconds every ten requests to avoid ban
                print("%s / %s names completed "%(i, total))
    finally:
        driver.quit()  # always close the driver, even after an error

    return None

In [15]:
def make_soup(url):
    """
    Download a page and hand its body to BeautifulSoup.

    args :
    url : a string, the address to fetch with an HTTP GET

    returns the BeautifulSoup object built from the response body with the
    'lxml' parser
    """
    # A fresh urllib3 pool is created for every call.
    pool = urllib3.PoolManager()
    response = pool.request("GET", url)
    soup = BeautifulSoup(response.data, 'lxml')
    return soup

def parse_images(url_list, images_url):
    """
    Parse each search-results page and collect the URL of its first image.

    For every page, the first <figure class="wine-card__image"> element is
    taken (the first result is considered the most relevant) and the address
    inside its ``url(...)`` expression is extracted.
    NOTE(review): this presumably is the CSS background-image of the figure;
    verify against the current vivino markup.

    Exactly one entry is appended to ``images_url`` per entry of
    ``url_list``, so both lists stay aligned. When a page has no matching
    figure, or the figure carries no usable ``url(...)``, a "no image
    available" placeholder URL is appended instead.

    args :
    url_list:  a list of urls
    images_url : the list of images url that will be extended in place

    returns nothing
    """
    placeholder = "http://jardinot.org/wp-content/uploads/2016/04/visuel-non-disponible.jpg"
    # Captures whatever sits between "url(" and the next ")".
    css_url = re.compile(r"url\(([^)]*)\)")

    for url in url_list:
        src = make_soup(url)
        res = src.find_all('figure', attrs={"class": "wine-card__image"})  # get the matching part of the html

        image_url = ""
        if len(res) > 0:
            found = css_url.search(str(res[0]))
            if found:
                # Drop surrounding whitespace and optional CSS quotes.
                image_url = found.group(1).strip().strip("'\"")

        if not image_url:
            # Nothing usable: keep the list aligned with a placeholder.
            images_url.append(placeholder)
        elif "://" in image_url:
            # Already an absolute URL: do not prepend a second scheme.
            images_url.append(image_url)
        else:
            # Protocol-relative address ("//host/path"): add the scheme.
            images_url.append('http:' + image_url)
    return None

In [16]:
def export_images_to_folder(images_url, names=None):
    """
    Download every image and save it in the current working directory.

    Each image is written to "<name>.png", pairing the i-th URL with the
    i-th name. The downloaded bytes are written unchanged: the ".png"
    extension does not convert the image format.

    args :
    images_url : a list of urls
    names : optional list of names used to build the file names, in the same
        order as ``images_url``. When omitted, the notebook-level
        ``wines_list`` variable is used, as in the original implementation.

    If the two lists differ in length, the extra entries of the longer one
    are ignored (``zip`` stops at the shorter list).

    returns None
    """
    if names is None:
        # Backward compatibility: fall back to the global defined when the
        # keywords file is loaded.
        names = wines_list

    for url, wine in zip(images_url, names):
        img_data = requests.get(url).content
        with open(str(wine)+".png", 'wb') as handler:  # saved as png - Stackoverflow seems to prefer it over jpg.
            handler.write(img_data)
    print("download complete.")
    return None

In [6]:
# set up the working directory
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
os.chdir('/Users/Gabriel/Documents/Github/Projet_informatique_ENSAE/cartes/scripts')

# Read the tab-separated names to search for.
# - the file is closed as soon as it has been read;
# - names from every line are kept (previously each line replaced the list,
#   so only the last line of the file survived);
# - line terminators are removed so the last name of a line does not carry a
#   trailing newline into the search box or the image file name;
# - empty fields are skipped.
# NOTE(review): the file is decoded with the platform default encoding -
# confirm it matches how keywords_to_scrap.txt was saved.
wines_list = []
with open("keywords_to_scrap.txt", 'r') as f:   # open the file
    for line in f:
        wines_list.extend(name for name in line.rstrip('\r\n').split('\t') if name)

In [22]:
# Browse vivino and collect one search-results URL per name in wines_list.
# get_url_from_selenium fills the list in place and returns nothing.
url_list = list()
get_url_from_selenium(wines_list=wines_list, url_list=url_list)

0 / 580 names completed 
10 / 580 names completed 
20 / 580 names completed 
30 / 580 names completed 
40 / 580 names completed 
50 / 580 names completed 
60 / 580 names completed 
70 / 580 names completed 
80 / 580 names completed 
90 / 580 names completed 
100 / 580 names completed 
110 / 580 names completed 
120 / 580 names completed 
130 / 580 names completed 
140 / 580 names completed 
150 / 580 names completed 
160 / 580 names completed 
170 / 580 names completed 
180 / 580 names completed 
190 / 580 names completed 
200 / 580 names completed 
210 / 580 names completed 
220 / 580 names completed 
230 / 580 names completed 
240 / 580 names completed 
250 / 580 names completed 
260 / 580 names completed 
270 / 580 names completed 
280 / 580 names completed 
290 / 580 names completed 
300 / 580 names completed 
310 / 580 names completed 
320 / 580 names completed 
330 / 580 names completed 
340 / 580 names completed 
350 / 580 names completed 
360 / 580 names completed 
370 / 580 na

In [23]:
# Get the links of the images: one image URL (or a placeholder) per search URL.
# parse_images fills the list in place and returns nothing.
images_url = list()
parse_images(url_list=url_list, images_url=images_url)

In [24]:
# Save the images into the current working directory.
# The target directory "wines" must already exist: os.chdir does not create it.
os.chdir('/Users/Gabriel/Documents/Github/Projet_informatique_ENSAE/img/wines')
export_images_to_folder(images_url=images_url)

download complete.
