In [1]:
#!pip install scrapy
import scrapy
from scrapy.crawler import CrawlerProcess
import logging
import string
import re

In [2]:
def None2empty(s):
    """Return *s* unchanged, or an empty string when *s* is ``None``.

    Only ``None`` is replaced: other falsy values (``""``, ``0``, ``[]``)
    are returned as they are.
    """
    # Identity test: `!= None` would go through the operand's __ne__ method,
    # which may be overridden and give a misleading answer.
    return s if s is not None else ""

# Credentials submitted to the site's login form.
MEMBER_LOGIN = "francois@fguerillon.net"
MEMBER_PASSWORD = ""  # TO BE FILLED IN before running the crawl

# First letters of the domain names to crawl (one alphabetical listing each).
# To crawl the whole alphabet, use: LETTERS = list(string.ascii_lowercase)
LETTERS = list("uvwxyz")

class MaClasseScrapy(scrapy.Spider):
    """Spider that logs in to larvf.com and scrapes wine reviews.

    Crawl order: login form -> alphabetical domain listings (one per entry of
    ``LETTERS``) -> domain pages -> wine pages.  One flat ``dict`` is yielded
    per review of each vintage of each wine.
    """

    name = 'Scraping larvf.com version 0.01'

    start_urls = ["https://www.larvf.com/identification"]

    # Patterns applied to the raw HTML of a "Millesime-description" block.
    # In "C.pages" the "." matches any single character in place of the
    # accented letter of "Cépages".
    _WINE_TYPE_RE = re.compile(r"Type\sde\svin[\s]*:[\s]*([^<>]+)")
    _VARIETIES_RE = re.compile(r"C.pages[\s]*:[\s]*([^<>]+)")

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, or None."""
        match = pattern.search(text)
        return match.group(1) if match is not None else None

    @staticmethod
    def _strip_price_markup(raw_html):
        """Remove the div/span markup (and the price label span) from a price block."""
        prices = re.sub(r"<div[^<>]*>", "", raw_html)
        prices = re.sub(r"</div[^<>]*>", "", prices)
        # The label span is dropped together with its text content.
        prices = re.sub(r"<span\sclass=\"MillesimeReview-priceLibelle\">[^<>]*</span[^>]*>", "", prices)
        prices = re.sub(r"<span[^<>]*>", "", prices)
        prices = re.sub(r"</span[^<>]*>", "", prices)
        return prices

    def parse(self, response):
        """Submit the login form; the login response itself is not scraped."""
        yield scrapy.FormRequest(
            "https://www.larvf.com/direct/membre/identification",  # the form's 'action' field
            callback=self.parse_first_page,
            method='POST',
            formdata={'memberLogin': MEMBER_LOGIN, 'memberPassword': MEMBER_PASSWORD},
        )

    def parse_first_page(self, response):
        """After login, request the alphabetical domain listing of each letter in LETTERS."""
        logging.info("###### (parse_first_page) URL : %s", response.url)
        for letter in LETTERS:
            url = "https://www.larvf.com/domaines/alpha/" + letter
            yield response.follow(url, callback=self.parse_liste_domaines)

    def parse_liste_domaines(self, response):
        """Follow every domain link found on an alphabetical listing page."""
        logging.info("###### (parse_liste_domaines) URL : %s", response.url)
        test_limit = None   # For test purpose only: max number of links to follow
        links = response.css("a.DomainList-domainLink")
        for count, link in enumerate(links, start=1):
            full_url = response.urljoin(link.attrib["href"])
            yield scrapy.Request(full_url, callback=self.parse_domaine)
            if test_limit is not None and count >= test_limit:
                break

    def parse_domaine(self, response):
        """Follow every wine link found on a domain page."""
        logging.info("###### (parse_domaine) URL : %s", response.url)
        test_limit = None   # For test purpose only: max number of links to follow
        links = response.css("a.WineItem")
        for count, link in enumerate(links, start=1):
            full_url = response.urljoin(link.attrib["href"])
            yield scrapy.Request(full_url, callback=self.parse_vin)
            if test_limit is not None and count >= test_limit:
                break

    def parse_vin(self, response):
        """Scrape a wine page and yield one dict per review of each vintage.

        Every yielded dict carries the wine-level fields, the vintage-level
        fields and the review-level fields.  A vintage without any review
        produces no item.
        """
        winedata = dict()

        # Wine name, domain name, number of stars of the domain:
        winedata["domain_name"] = response.css(
            "div.Article-profileMetasContainer a.Article-domainMetaElement::text").get()
        winedata["domain_stars"] = len(response.css(
            "div.Article-profileMetasContainer i.Article-metaStar"))
        winedata["wine_name"] = response.css("h1.Title--article span.Title-content::text").get()

        # Region and appellation: each labelled meta element becomes a key
        # named after its label text.
        for meta in response.css("div.Article-meta div.Article-metaElement"):
            label = meta.css("label::text").get()
            value = meta.css("span.Article-metaValue--highlight a::text").get()
            if label is not None:
                winedata[label] = value

        # File name of the twitter image, which can hint at the wine colour:
        image_url = response.css(r'meta[property="twitter:image"]').attrib.get("content")
        if image_url is not None:
            winedata["vintage_twitterimg"] = re.sub(r".*/", "", image_url)
        else:
            winedata["vintage_twitterimg"] = ""

        # Wine types and grape varieties (wine-level information that is
        # sometimes only available on some of the vintages), aggregated over
        # every vintage description of the page:
        all_types = []
        all_varieties = []
        for desc in response.css("div.Millesime-description").getall():
            wine_type = self._first_group(self._WINE_TYPE_RE, desc)
            if wine_type is not None:
                all_types.extend(wine_type.split(", "))
            varieties = self._first_group(self._VARIETIES_RE, desc)
            if varieties is not None:
                all_varieties.append(varieties)
        winedata["wine_types"] = ", ".join(all_types)
        winedata["wine_varieties"] = " | ".join(all_varieties)

        # Collect the information of every known vintage of this wine:
        for vintage in response.css("div.MillesimesList div.Millesime"):
            vintagedata = winedata.copy()

            # Vintage year (digits only):
            vintagedata["vintage"] = re.sub(
                r"[^0-9]+", "", None2empty(vintage.css("span.Millesime-year::text").get()))

            # Types and varieties of this vintage.  A vintage may have no
            # description block at all: .get() then returns None, which must
            # not be handed to the regex search (it would raise TypeError).
            desc = None2empty(vintage.css("div.Millesime-description").get())
            vintagedata["vintage_types"] = self._first_group(self._WINE_TYPE_RE, desc)
            vintagedata["vintage_varieties"] = self._first_group(self._VARIETIES_RE, desc)

            # Price information text:
            rawprices = vintage.css("div.MillesimeReview-price").get()
            if rawprices is not None:
                vintagedata["vintage_prices"] = self._strip_price_markup(rawprices)
            else:
                vintagedata["vintage_prices"] = ""

            # Every review of this vintage:
            for review in vintage.css("div.MillesimeReview"):
                reviewdata = vintagedata.copy()
                reviewdata["review_score"] = re.sub(
                    r"<[^<>]*>", "", None2empty(review.css("span.WineScore").get()))
                reviewdata["review_topline"] = None2empty(
                    review.css("div.MillesimeReview-topLine::text").get()).strip()
                reviewdata["review_text"] = None2empty(
                    review.css("div.MillesimeReview-text::text").get()).strip()
                yield reviewdata


# WARNING: restart the kernel before calling this function.
def scrape_all(filepath):
    """Run the MaClasseScrapy spider and export the scraped items as JSON.

    filepath -- destination of the JSON feed.

    NOTE(review): the restart requirement is presumably because the crawler
    process cannot be started twice in the same Python process -- confirm.
    """
    crawl_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_LEVEL': logging.INFO,
        'FEEDS': {filepath: {"format": "json"}},
        'AUTOTHROTTLE_ENABLED': True,
    }
    crawler = CrawlerProcess(settings=crawl_settings)
    crawler.crawl(MaClasseScrapy)
    crawler.start()

import time
# Output file name: crawled letters + Unix timestamp (seconds), so that
# successive runs do not overwrite each other.
letters_tag = ''.join(LETTERS)
timestamp = str(int(time.time()))
filename = 'scrapy/larvf_0.01_output/scrapy_larvf_' + letters_tag + '_' + timestamp + '.json'
scrape_all(filename)


2020-11-09 06:17:04 [scrapy.utils.log] INFO: Scrapy 2.4.0 started (bot: scrapybot)
2020-11-09 06:17:04 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.9.2, Platform Windows-10-10.0.18362-SP0
2020-11-09 06:17:04 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2020-11-09 06:17:04 [scrapy.extensions.telnet] INFO: Telnet Password: 4c78dcfcf0106780
2020-11-09 06:17:04 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.throttle.AutoThrottle']
2020-11-09 06:17:04 [scrapy.middleware] INF

2020-11-09 06:23:58 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,maison-ughetto-audouin,12670,4259692.asp
2020-11-09 06:23:59 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-u-stiliccionu,12670,4253767.asp
2020-11-09 06:24:03 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-vrignaud,10524,406484.asp
2020-11-09 06:24:03 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-vray-houchat,10479,404392.asp
2020-11-09 06:24:04 [scrapy.extensions.logstats] INFO: Crawled 205 pages (at 31 pages/min), scraped 650 items (at 136 items/min)
2020-11-09 06:24:05 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-jean-vullien,10840,404026.asp
2020-11-09 06:24:05 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,chateau-vray-croix-de-gay,10495,403523.asp
2020-11-09 06:24:09 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-vray-canon-bodet-la-tour,10471,405911.

2020-11-09 06:32:21 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/vins-fins-du-perigord,10873,406400.asp
2020-11-09 06:32:22 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,les-vins-de-vienne,10612,402920.asp
2020-11-09 06:32:24 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,cave-des-vins-de-sancerre,10777,4021145.asp
2020-11-09 06:32:25 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/vins-de-mouzillon,10765,405520.asp
2020-11-09 06:32:26 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,la-madone,12670,4391360.asp
2020-11-09 06:32:34 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/cave-des-vins-de-bourgueil,10736,408057.asp
2020-11-09 06:32:36 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,vinotropie,10014,4284936.asp
2020-11-09 06:32:36 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,vino-vallis,10624,4022747.asp
2020-11-09 06:32:36 [root] INFO: #####

2020-11-09 06:40:39 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,chateau-vilatte,12670,4410074.asp
2020-11-09 06:40:42 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,chateau-viguerie-de-beulaygue,10855,2008660.asp
2020-11-09 06:40:46 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/atrium-georges-vigouroux,12663,403283.asp
2020-11-09 06:41:04 [scrapy.extensions.logstats] INFO: Crawled 825 pages (at 41 pages/min), scraped 2612 items (at 104 items/min)
2020-11-09 06:42:04 [scrapy.extensions.logstats] INFO: Crawled 870 pages (at 45 pages/min), scraped 2794 items (at 182 items/min)
2020-11-09 06:42:31 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-thierry-vigot,10516,407424.asp
2020-11-09 06:42:31 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-alain-vignot,10516,407874.asp
2020-11-09 06:42:32 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-fabrice-vigot,10595,40075

2020-11-09 06:52:04 [scrapy.extensions.logstats] INFO: Crawled 1231 pages (at 48 pages/min), scraped 3595 items (at 86 items/min)
2020-11-09 06:52:19 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-vignalet,10668,406514.asp
2020-11-09 06:52:20 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-des-vigiers,10844,402848.asp
2020-11-09 06:52:21 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-du-vieux-tuffeau,10778,408524.asp
2020-11-09 06:52:23 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-du-vieux-telegraphe,10612,400738.asp
2020-11-09 06:52:24 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-vieux-taillefer,12670,4409140.asp
2020-11-09 06:52:24 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-du-vieux-tinel,10623,405286.asp
2020-11-09 06:52:27 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,clos-vieux-rochers,10465,2010801.asp
20

2020-11-09 06:57:07 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,eurl-pierre-vidal,12670,4411093.asp
2020-11-09 06:57:09 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-vidal,12670,4350885.asp
2020-11-09 06:57:11 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-des-victoires,10771,408543.asp
2020-11-09 06:57:15 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-vico,12670,4251642.asp
2020-11-09 06:57:16 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-jean-luc-viaud,10755,405532.asp
2020-11-09 06:57:19 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-de-viaud,10483,405062.asp
2020-11-09 06:57:24 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-vial-magneres,10821,401182.asp
2020-11-09 06:57:24 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,maison-philippe-viallet,12670,4353630.asp
2020-11-09 06:58:04 [scrapy.ex

2020-11-09 07:09:07 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-du-veilloux,10741,401935.asp
2020-11-09 07:09:09 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-de-la-velle,10512,404769.asp
2020-11-09 07:09:10 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-clothide-et-pascal-vecten,10516,408845.asp
2020-11-09 07:10:04 [scrapy.extensions.logstats] INFO: Crawled 1870 pages (at 29 pages/min), scraped 5534 items (at 38 items/min)
2020-11-09 07:11:04 [scrapy.extensions.logstats] INFO: Crawled 1920 pages (at 50 pages/min), scraped 5627 items (at 93 items/min)
2020-11-09 07:11:14 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-vecchio,10014,4288211.asp
2020-11-09 07:11:14 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/vazart-coquart,10601,403376.asp
2020-11-09 07:11:16 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-de-vayssette,10856,402743.asp
2020-1

2020-11-09 07:18:54 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-de-valdition,12670,4020230.asp
2020-11-09 07:18:54 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,valdamor,12670,4355623.asp
2020-11-09 07:18:57 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/chateau-de-valcombe,10617,402671.asp
2020-11-09 07:19:03 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-de-valcolombe,10808,409297.asp
2020-11-09 07:19:03 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,iber-y-co,12670,4464698.asp
2020-11-09 07:19:03 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-des-valanges,10590,408339.asp
2020-11-09 07:19:03 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-valcaire,10806,407716.asp
2020-11-09 07:19:04 [scrapy.extensions.logstats] INFO: Crawled 2179 pages (at 42 pages/min), scraped 6142 items (at 149 items/min)
2020-11-09 07:19:05 [root] INFO: ###

2020-11-09 07:28:42 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-wantz,10439,404711.asp
2020-11-09 07:28:45 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-bernard-walter,10440,407710.asp
2020-11-09 07:28:48 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/walden,10824,408388.asp
2020-11-09 07:28:49 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-waegell,10439,406158.asp
2020-11-09 07:28:50 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/domaine-jean-wach-et-fils,10439,402700.asp
2020-11-09 07:28:52 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-des-marronniers,10439,401869.asp
2020-11-09 07:28:52 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-zind-humbrecht,10439,401036.asp
2020-11-09 07:28:54 [root] INFO: ###### (parse_domaine) URL : https://www.larvf.com/,domaine-paul-zinck,10439,404710.asp
2020-11-09 07:28:56 [root] INFO: ####

In [3]:
# Trial load of the scraped JSON feed into pandas:
import pandas as pd
DF_test = pd.read_json(filename)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
DF_test.info()
DF_test.head(30)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8075 entries, 0 to 8074
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   domain_name         8075 non-null   object
 1   domain_stars        8075 non-null   int64 
 2   wine_name           8075 non-null   object
 3   Région              8044 non-null   object
 4   Appellation         8075 non-null   object
 5   vintage_twitterimg  8075 non-null   object
 6   wine_types          8075 non-null   object
 7   wine_varieties      8075 non-null   object
 8   vintage             8075 non-null   object
 9   vintage_types       911 non-null    object
 10  vintage_varieties   811 non-null    object
 11  vintage_prices      8075 non-null   object
 12  review_score        8075 non-null   object
 13  review_topline      8075 non-null   object
 14  review_text         8075 non-null   object
dtypes: int64(1), object(14)
memory usage: 946.4+ KB
None


Unnamed: 0,domain_name,domain_stars,wine_name,Région,Appellation,vintage_twitterimg,wine_types,wine_varieties,vintage,vintage_types,vintage_varieties,vintage_prices,review_score,review_topline,review_text
0,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2016,"Tranquille, Sec",Chenin (Blanc) 100%,40 €,"16,5/20",Guide vert 2020,"En 2016, ce blanc parcellaire de haute concent..."
1,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2016,"Tranquille, Sec",Chenin (Blanc) 100%,40 €,17/20,Guide vert 2019,Grande concentration en extraits dans cette cu...
2,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2015,,Chenin (Blanc) 100%,40 €,"16,5/20",Guide vert 2018,Grande profondeur calcaire dans ce chenin rich...
3,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2014,"Tranquille, Sec",,40 €,16/20,Guide vert 2017,"Bien mûr, enrichi par la profondeur des sols a..."
4,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2014,"Tranquille, Sec",,40 €,17/20,Magazine RVF 608,"Sélection parcellaire de 15 ares, isolée dès 2..."
5,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2013,,,39 €,16/20,Guide vert 2016,
6,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2011,,,40 €,16/20,Guide vert 2015,
7,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,2000,,,"29,30 €",18/20,Guide vert 2004,
8,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,1999,,,,17/20,Guide vert 2004,
9,Château Yvonne,2,Saumur Le Gory,Vallée de la Loire et Centre,Saumur,bouteille-vin-vallee-loire-blanc.jpg,"Tranquille, Sec, Tranquille, Sec, Tranquille, Sec",Chenin (Blanc) 100% | Chenin (Blanc) 100% | Ch...,1999,,,,18/20,Guide vert 2003,
