-
Notifications
You must be signed in to change notification settings - Fork 10
/
spider.py
88 lines (67 loc) · 3.22 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import scrapy
import json
import re
from product_scraping.items import Product
class CaWalmartBot(scrapy.Spider):
    """Scrape product data from the walmart.ca fruits listing.

    Flow:
      1. ``parse`` walks the listing pages and follows every product link.
      2. ``parse_html`` reads the JSON blobs embedded in the product page and
         builds the store-independent part of a ``Product`` item.
      3. ``parse_api`` queries the find-in-store API once per configured
         branch location and completes the item with branch, stock and price.

    One item is emitted per (product, branch location) pair.
    """

    name = 'ca_walmart'
    allowed_domains = ['walmart.ca']
    start_urls = ['https://www.walmart.ca/en/grocery/fruits-vegetables/fruits/N-3852']

    # Headers sent with the find-in-store API requests.
    # NOTE(review): advertising 'br' presumably requires brotli support to be
    # installed for Scrapy to decode the response -- confirm in deployment.
    header = {
        'Host': 'www.walmart.ca',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json',
        'Connection': 'keep-alive'
    }

    # Locations queried for stock/price, as [latitude, longitude].
    # Keys are presumably Walmart branch ids -- verify against the API output.
    branches = {
        '3106': ['43.656422', '-79.435567'],
        '3124': ['48.412997', '-89.239717'],
    }

    # Find-in-store endpoint; placeholders are latitude, longitude, upc.
    store_api_url = ('https://www.walmart.ca/api/product-page/find-in-store?'
                     'latitude={}&longitude={}&lang=en&upc={}')

    def parse(self, response):
        """Follow every product link on a listing page, then the next page.

        Yields:
            Requests for product pages (handled by ``parse_html``) and, when a
            "load more" link exists, a request for the next listing page.
        """
        for href in response.css('.product-link::attr(href)').getall():
            # Resolve the href against the listing page it came from so the
            # item ends up with a real, absolute product URL.
            yield response.follow(href, callback=self.parse_html,
                                  cb_kwargs={'url': response.urljoin(href)})

        next_page = response.css('#loadmore::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_html(self, response, url):
        """Build the store-independent item fields from a product page.

        Args:
            response: The product page response.
            url: The product link as found on the listing page (absolute or
                relative; it is normalised with ``response.urljoin``).

        Yields:
            One find-in-store API request per entry in ``branches``; each
            carries its own copy of the partially filled item.
        """
        state_script = response.xpath('/html/body/script[1]/text()').get()
        product_script = response.css('.evlleax2 > script:nth-child(1)::text').get()

        # The first body script wraps a JSON object in other text; grab the
        # outermost {...} found on a single line.
        state_match = re.search(r'(\{.*\})', state_script or '')
        if state_match is None or product_script is None:
            self.logger.warning('Product data not found on %s; skipping', response.url)
            return

        gral_dict = json.loads(state_match.group(1))
        prod_dict = json.loads(product_script)

        sku = prod_dict['sku']
        sku_data = gral_dict['entities']['skus'][sku]

        upc = sku_data['upc']
        if not upc:
            # Without a UPC the find-in-store API cannot be queried.
            self.logger.warning('No UPC for sku %s on %s; skipping', sku, response.url)
            return

        # Category path: up to three hierarchy levels in reverse listed order,
        # followed by the first facet value.
        hierarchy = sku_data['categories'][0]['hierarchy']
        levels = [level['displayName']['en'] for level in hierarchy[:3]]
        category = ' | '.join(levels[::-1] + [sku_data['facets'][0]['value']])

        item = Product()
        item['barcodes'] = ', '.join(upc)
        # Positional <meta> lookup; brittle if the page head changes.
        item['store'] = response.xpath('/html/head/meta[10]/@content').get()
        item['category'] = category
        item['package'] = sku_data['description']
        item['url'] = response.urljoin(url)
        item['brand'] = prod_dict['brand']['name']
        item['image_url'] = ', '.join(prod_dict['image'])
        item['description'] = prod_dict['description'].replace('<br>', '')
        item['sku'] = sku
        item['name'] = prod_dict['name']

        for latitude, longitude in self.branches.values():
            yield scrapy.Request(
                self.store_api_url.format(latitude, longitude, upc[0]),
                callback=self.parse_api,
                # Each request gets its own copy: the callbacks fill in
                # branch-specific fields and must not overwrite each other.
                cb_kwargs={'item': item.copy()},
                meta={'handle_httpstatus_all': True},
                headers=self.header,
            )

    def parse_api(self, response, item):
        """Complete an item with branch, stock and price from the store API.

        Args:
            response: The find-in-store API response (any HTTP status, since
                the request sets ``handle_httpstatus_all``).
            item: The partially filled item created in ``parse_html``.

        Yields:
            The completed item, or nothing when the response carries no
            usable store information.
        """
        try:
            store_dict = json.loads(response.body)
        except ValueError:
            self.logger.warning('Non-JSON store response (HTTP %s) from %s',
                                response.status, response.url)
            return

        stores = store_dict.get('info') if isinstance(store_dict, dict) else None
        if not stores:
            self.logger.warning('No store info (HTTP %s) from %s',
                                response.status, response.url)
            return

        # Only the first listed store is used for each queried location.
        first_store = stores[0]
        item['branch'] = first_store['id']
        item['stock'] = first_store['availableToSellQty']
        # A store entry without a sell price is recorded with price 0.
        item['price'] = first_store.get('sellPrice', 0)
        yield item