In [3]:
#!pip install scrapy
import json
import logging
import re
import string
import time

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

In [5]:
MIN_PAGE = 81
MAX_PAGE = 181  # So as not to crawl everything in one go... the full range is pages 1 to 181
LETTER = 'e'   # To capture the complete database, two letters seem to be needed (a and e)

class MaClasseScrapy(scrapy.Spider) :
    """Spider that scrapes wine listing pages of idealwine.com.

    The crawl starts from the listing pages ``MIN_PAGE``..``MAX_PAGE`` of the
    index letter ``LETTER``.  Every wine link found on a listing page is
    followed and parsed by :meth:`parse_vintage`, which yields one flat dict
    per page visited.
    """
    name = 'Scraping idealwine version 0.01'

    start_urls = [
        f"https://www.idealwine.com/fr/prix-vin/{LETTER}.jsp?page={n}"
        for n in range(MIN_PAGE, MAX_PAGE + 1)
    ]

    # For test purposes only: maximum number of links followed per page
    # (an int), or None to follow every link.
    test_limit = None

    def _follow_links(self, response, selector, cb_kwargs):
        """Yield a ``parse_vintage`` request for each link matched by ``selector``.

        Anchors without an ``href`` attribute are skipped instead of raising
        ``KeyError`` (which would abort the remaining links of the page).
        Stops early once ``self.test_limit`` matched anchors have been seen.
        """
        for count, anchor in enumerate(response.css(selector), start=1):
            href = anchor.attrib.get("href")
            if href:
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_vintage,
                    cb_kwargs=dict(cb_kwargs),
                )
            if (self.test_limit is not None) and (count >= self.test_limit):
                break

    @staticmethod
    def _parse_cotes(script):
        """Return ``{'cote_<label>': value}`` read from the price-chart script.

        ``script`` is the text of the ``<script>`` element mentioning
        ``canvas-courbe-cote``, or None when the page has none; an empty dict
        is returned when the script or its ``labels``/``data`` arrays are missing.
        """
        cotes = {}
        if script is None:
            # The page has no price chart: nothing to extract.
            return cotes
        data = re.search(r'data:\s\[([0-9\"\.\,\s]+)\]', script)
        labels = re.search(r'labels:\s\[([0-9\"\,\s]+)\]', script)
        if (data is None) or (labels is None):
            return cotes
        for label, value in zip(labels.group(1).split(","), data.group(1).split(",")):
            # Ignore empty / too-short fragments produced by the split.
            if len(label) > 5:
                cotes['cote_' + label.replace('"', '').replace("\n", "")] = value.replace('"', '')
        return cotes

    @staticmethod
    def _parse_notes(script):
        """Return ``{'note_<source>': score}`` read from the radar-chart script.

        Each label is expected to look like ``<source> <score>/<scale>``; the
        score keeps its digits with a decimal comma turned into a dot.
        Returns an empty dict when the script or its ``labels`` array is missing.
        """
        notes = {}
        if script is None:
            return notes
        # NOTE(review): `A-z` also matches the characters [ \ ] ^ _ ` that sit
        # between 'Z' and 'a'; left unchanged because narrowing it could drop
        # labels on pages whose content is not visible here.
        labels = re.search(r'labels:\s\[([A-z0-9\"\,\.\-\s/]+)\]', script)
        if labels is None:
            return notes
        body = labels.group(1)
        # Take whole double-quoted strings so that a decimal comma inside a
        # score ("17,5/20") is not mistaken for a list separator; fall back to
        # a plain comma split when the list holds no quoted string.
        raw_labels = re.findall(r'"([^"]*)"', body) or body.split(",")
        for raw_label in raw_labels:
            parsed = re.search(r"^(.*)\s([0-9\,]+)/([0-9]+)$", raw_label.replace('"', ''))
            if parsed is not None:
                source = re.sub(r"[\.\-\s\n]+", "", parsed.group(1))
                notes['note_' + source] = parsed.group(2).replace(',', '.')
        return notes

    # Apparently no authentication is required:
    def parse(self, response):
        """Parse one listing page and follow every wine link found in its table.

        The page number is read from the ``?page=N`` suffix of the URL (0 when
        absent) and forwarded to :meth:`parse_vintage` for traceability.
        """
        pagenum = 0
        res = re.search(r"\?page=([0-9]+)$", response.url)
        if res is not None:
            pagenum = int(res.group(1))
        logging.info("###### (parse) PAGE : %s ;  URL : %s", pagenum, response.url)

        # First-column links of every table row except the first one.
        yield from self._follow_links(
            response,
            "tr:nth-of-type(n+2) td:nth-of-type(1) a",
            {'pagenum': pagenum},
        )

    def parse_vintage(self, response, pagenum, first=True):
        """Parse one vintage page and yield its data as a flat dict.

        Parameters:
            response: the Scrapy response of the vintage page.
            pagenum: listing page number the wine was found on (stored as ``page``).
            first: when True, the ``a.ola`` links of the page are followed too
                (presumably the other vintages of the same wine - TODO confirm);
                those follow-up requests pass ``first=False`` so the crawl does
                not recurse any further.
        """
        logging.info("###### (parse_vintage) URL : %s", response.url)
        vintagedata = {'page': pagenum}

        vintagedata['nom_du_vin'] = response.css("#millesime-note strong::text").get()
        vintagedata['millesime'] = response.css("a.selected-vintage::text").get()
        vintagedata['pays_region'] = response.css(".property li:nth-of-type(1) strong::text").get()
        vintagedata['domaine'] = response.css(".hint2 strong::text").get()
        vintagedata['appellation'] = response.css(".property li:nth-of-type(2) strong::text").get()

        # The colour entry has no fixed position in the list, so it is located
        # by its caption; the last match wins, as before.
        for color in response.css(".property li:contains(\"Couleur :\")"):
            vintagedata['couleur'] = color.css("strong::text").get()

        vintagedata.update(
            self._parse_cotes(response.css("script:contains(\"canvas-courbe-cote\")").get())
        )
        vintagedata.update(
            self._parse_notes(response.css("script:contains(\"canvas-radar-note\")").get())
        )

        #logging.info("Scraped = " + json.dumps(vintagedata))
        yield vintagedata

        if first:
            yield from self._follow_links(
                response, "a.ola", {'pagenum': pagenum, 'first': False}
            )


# WARNING: restart the kernel before running this (see the docstring).
def scrape_all(filepath):
    """Run the ``MaClasseScrapy`` spider and export its items as JSON to ``filepath``.

    A fresh kernel is required for each run: the captured notebook output
    shows ``ReactorNotRestartable`` when the crawl is started again in a
    kernel that has already run it.
    """
    crawl_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_LEVEL': logging.INFO,
        # One JSON feed, written to the requested path.
        'FEEDS': {filepath: {"format": "json"}},
        'AUTOTHROTTLE_ENABLED': True,
    }
    crawler_process = CrawlerProcess(settings=crawl_settings)
    crawler_process.crawl(MaClasseScrapy)
    crawler_process.start()

# Output path of this run: it encodes the crawled letter and page range, plus a
# Unix timestamp so that successive runs never overwrite one another.
# (`time` is imported in the notebook's import cell.)
filename = (
    f'idealwine_0.01_output/scrapy_idealwine_{LETTER}_'
    f'{MIN_PAGE}to{MAX_PAGE}_{int(time.time())}.json'
)
scrape_all(filename)


2020-11-20 23:21:15 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: scrapybot)
2020-11-20 23:21:15 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-11-20 23:21:15 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2020-11-20 23:21:15 [scrapy.extensions.telnet] INFO: Telnet Password: c4b24be83333b7fb
2020-11-20 23:21:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.throttle.AutoThrottle']
2020-11-20 23:21:15 [scrapy.middleware] INF

ReactorNotRestartable: 

In [4]:
# Trial load of the scraped JSON feed into pandas.
# NOTE: relies on `filename` set by the scraping cell, and on pandas being
# imported (as `pd`) in the notebook's import cell.
DF_test = pd.read_json(filename)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would add a stray "None" line to the output).
DF_test.info()

DF_test.head(30)
# Example lookup:
#DF_test[DF_test.nom_du_vin=="Bourgogne Les Violettes Bizot"]



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26700 entries, 0 to 26699
Data columns (total 46 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   page            26700 non-null  int64  
 1   nom_du_vin      26700 non-null  object 
 2   millesime       26700 non-null  object 
 3   pays_region     26700 non-null  object 
 4   domaine         20410 non-null  object 
 5   appellation     26700 non-null  object 
 6   couleur         26700 non-null  object 
 7   cote_2019       23343 non-null  float64
 8   cote_2020       26700 non-null  float64
 9   cote_2014       12362 non-null  float64
 10  cote_2015       14770 non-null  float64
 11  cote_2016       16179 non-null  float64
 12  cote_2017       17848 non-null  float64
 13  cote_2018       20396 non-null  float64
 14  cote_2011       8600 non-null   float64
 15  cote_2012       9775 non-null   float64
 16  cote_2013       10816 non-null  float64
 17  cote_2007       5605 non-null  

Unnamed: 0,page,nom_du_vin,millesime,pays_region,domaine,appellation,couleur,cote_2019,cote_2020,cote_2014,...,note_iDw,note_WA,note_WS,note_Burghound,note_VinousA,note_JR,note_RVF,note_JMQ,note_DECANTER,note_Lepoint
0,81,Nuits Saint-Georges 1er Cru Aux Boudots Charle...,2009,Bourgogne,Charles Noëllat,Nuits Saint-Georges,Rouge,47.42,49.49,,...,,,,,,,,,,
1,82,Nuits Saint-Georges Les Bas de Combe Hudelot-N...,2011,Bourgogne,Hudelot-Noëllat,Nuits Saint-Georges,Rouge,49.97,51.01,37.0,...,,,,,,,,,,
2,83,Pernand-Vergelesses,2017,Bourgogne,,Pernand-Vergelesses,Rouge,15.0,15.15,,...,,,,,,,,,,
3,84,Pommard 1er Cru Grand Clos des Epenots de Courcel,2013,Bourgogne,Domaine de Courcel,Pommard,Rouge,,53.42,,...,,,,,,,,,,
4,85,Pommard La Levrière Vieilles Vignes Dugat-Py,2011,Bourgogne,Dugat-Py,Pommard,Rouge,94.73,96.7,,...,,,,,,,,,,
5,86,Pouilly-Fuissé Valette,2012,Bourgogne,Valette,Pouilly-Fuissé,Blanc,48.64,50.67,,...,,,,,,,,,,
6,87,Puligny-Montrachet 1er Cru Les Folatières Bach...,2016,Bourgogne,Bachelet-Monnot (Domaine),Puligny-Montrachet,Blanc,91.2,92.1,,...,,,,,,,,,,
7,88,Puligny-Montrachet 1er Cru Les Combettes Franç...,2017,Bourgogne,François Carillon,Puligny-Montrachet,Blanc,,79.82,,...,,,,,,,,,,
8,89,Puligny-Montrachet Benoit Ente,2017,Bourgogne,Benoit Ente,Puligny-Montrachet,Blanc,,70.0,,...,,,,,,,,,,
9,91,Rully 1er Cru Les Margotés Vincent Dureuil-Jan...,2016,Bourgogne,Vincent Dureuil-Janthial,Rully,Blanc,31.93,32.25,,...,,,,,,,,,,


In [None]:
# Select the rows whose wine name contains "Château Talbot Caillou Blanc" and
# whose colour is 'Blanc'.  A literal substring match through the vectorised
# .str accessor replaces the per-row lambda; na=False makes rows with a missing
# name count as non-matching instead of raising TypeError.
is_caillou_blanc = DF_test.nom_du_vin.str.contains(
    "Château Talbot Caillou Blanc", regex=False, na=False
)
DF_sel = DF_test[is_caillou_blanc & (DF_test.couleur == 'Blanc')]
print(len(DF_sel))
print(DF_sel.millesime.head(30).tolist())

In [5]:
""


''