### playnomore
- http://playnomore.co.kr/
- scrapy에서 fake-useragent 사용
- scrapy를 실행할 때 아규먼트(argument)를 설정해서 실행
- pipelines에서 데이터베이스에 데이터를 저장

In [None]:
import scrapy
import requests
from scrapy.http import TextResponse

#### 1. 프로젝트 생성

In [None]:
!rm -rf playnomore
!scrapy startproject playnomore

New Scrapy project 'playnomore', using template directory '/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /home/ubuntu/python3/notebook/scrapy/playnomore

You can start your first spider with:
    cd playnomore
    scrapy genspider example example.com


#### 2. items.py
- title, price, img, link

In [None]:
%%writefile playnomore/playnomore/items.py
import scrapy

class PlaynomoreItem(scrapy.Item):
    """Container for the data scraped from one playnomore.co.kr product detail page."""
    title = scrapy.Field()  # product name text
    price = scrapy.Field()  # price string as shown on the page, e.g. '$ 162'
    img = scrapy.Field()  # absolute URL of the main product image
    link = scrapy.Field()  # URL of the product detail page

Overwriting playnomore/playnomore/items.py


#### 3. xpath 확인
- 링크
- 링크 -> 상세페이지(제목, 이미지URL, 가격)
- fake_useragent 설치
    - pip install fake_useragent

In [None]:
from fake_useragent import UserAgent
# Category listing page used to work out the XPath expressions.
url = "http://playnomore.co.kr/category/bag/24/"
# A random Chrome user agent from fake_useragent can be used instead:
#   headers = { "User-Agent": UserAgent().chrome }
headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" }
# Bound the wait so an unresponsive server cannot hang the kernel, and fail
# right here on an HTTP error instead of parsing an error page further down.
req = requests.get(url, headers=headers, timeout=10)
req.raise_for_status()
# Wrap the body in a Scrapy TextResponse so .xpath() and .urljoin() are available.
response = TextResponse(req.url, body=req.text, encoding="utf-8")
response

<200 http://playnomore.co.kr/category/bag/24/>

In [None]:
# Product detail links on the category page, resolved to absolute URLs.
hrefs = response.xpath(
    '//*[@id="contents"]/div[2]/div/ul/li/div/a/@href'
).extract()
links = [response.urljoin(href) for href in hrefs]

In [None]:
# Detail page of the first product: source for title, price and image URL.
url = links[0]
headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36" }
# Bound the wait so an unresponsive server cannot hang the kernel, and fail
# right here on an HTTP error instead of parsing an error page further down.
req = requests.get(url, headers=headers, timeout=10)
req.raise_for_status()
# Wrap the body in a Scrapy TextResponse so .xpath() is available.
response = TextResponse(req.url, body=req.text, encoding="utf-8")
response

<200 http://playnomore.co.kr/product/black-play-day-10-micro-baguette-grey-python-180/547/?cate_no=24&display_group=1>

In [None]:
# The title is split between a <font> child and the div's own text nodes,
# so both parts are collected and concatenated in that order.
title1 = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()'
).extract()
title2 = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()'
).extract()
title = "".join(title1 + title2)

# First text node of the price div.
price_texts = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()'
).extract()
price = price_texts[0]

# The src attribute has no scheme, so "http:" is prepended to make it absolute.
img_srcs = response.xpath(
    '//*[@id="contents"]/div[1]/div[1]/div[1]/div[1]/img/@src'
).extract()
img = "http:" + img_srcs[0]

title, price, img

('[Black Play-Day 10%] MICRO BAGUETTE grey python ',
 '$ 162',
 'http://playnomore.co.kr/web/product/big/201910/596078374708bbb81ffd629a8cf88950.jpg')

#### 4. spider.py
- scrapy-fake-useragent 설치
    - pip install scrapy-fake-useragent

In [None]:
!pip list | grep fake

fake-useragent        0.1.11   
scrapy-fake-useragent 1.1.0    


In [None]:
%%writefile playnomore/playnomore/spiders/spider.py
import scrapy
from playnomore.items import PlaynomoreItem


class PlaynomoreSpider(scrapy.Spider):
    """Crawl the playnomore.co.kr bag category and yield one item per product.

    The listing page is fetched first; every product link found there is then
    followed and parsed by ``page_parse``.
    """

    name = "Playnomore"
    custom_settings = {
        # Swap Scrapy's built-in user-agent middleware for the random one
        # provided by scrapy-fake-useragent.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }

    def start_requests(self):
        """Yield the request for the category listing page."""
        url = "http://playnomore.co.kr/category/bag/24/"
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Yield a request for every product detail link on the listing page."""
        hrefs = response.xpath('//*[@id="contents"]/div[2]/div/ul/li/div/a/@href').extract()
        for href in hrefs:
            # hrefs may be relative; resolve them against the listing page URL.
            yield scrapy.Request(response.urljoin(href), callback=self.page_parse)

    def page_parse(self, response):
        """Yield a PlaynomoreItem built from a product detail page.

        Pages that lack the expected price or image node are logged and
        skipped instead of raising from the callback. The crawl log shows a
        spider error on a response redirected to the m.playnomore.co.kr
        mobile site, which presumably uses different markup.
        """
        title1 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()').extract()
        title2 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()').extract()
        price = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()').extract_first()
        img_src = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[1]/div[1]/img/@src').extract_first()

        if price is None or img_src is None:
            self.logger.warning("Skipping %s: expected product markup not found", response.url)
            return

        item = PlaynomoreItem()
        # The title is split between a <font> child and the div's own text.
        item["title"] = "".join(title1) + "".join(title2)
        item["price"] = price
        # The src attribute has no scheme, so "http:" is prepended.
        item["img"] = "http:" + img_src
        item["link"] = response.url
        yield item

Overwriting playnomore/playnomore/spiders/spider.py


In [None]:
%%writefile run.sh
#!/bin/sh
# Run the Playnomore spider from inside the Scrapy project and export the items to CSV.
# Abort if the project directory is missing so scrapy never runs from the wrong place.
cd playnomore || exit 1
scrapy crawl Playnomore -o playnomore.csv

Overwriting run.sh


In [None]:
!chmod +x run.sh

In [None]:
!./run.sh

2019-11-28 05:22:59 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: playnomore)
2019-11-28 05:22:59 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Oct 24 2019, 05:23:48) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1054-aws-x86_64-with-debian-buster-sid
2019-11-28 05:22:59 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'playnomore', 'FEED_FORMAT': 'csv', 'FEED_URI': 'playnomore.csv', 'NEWSPIDER_MODULE': 'playnomore.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['playnomore.spiders']}
2019-11-28 05:22:59 [scrapy.extensions.telnet] INFO: Telnet Password: 8b7b67df4e8b4ec0
2019-11-28 05:22:59 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'sc

2019-11-28 05:23:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://playnomore.co.kr/product/black-play-day-20-micro-candy-midnight-170/536/?cate_no=24&display_group=1> (referer: http://playnomore.co.kr/category/bag/24/)
2019-11-28 05:23:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://playnomore.co.kr/product/black-play-day-10-micro-moon-chocolate-180/540/?cate_no=24&display_group=1> (referer: http://playnomore.co.kr/category/bag/24/)
2019-11-28 05:23:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://playnomore.co.kr/product/black-play-day-20-micro-candy-black-170/506/?cate_no=24&display_group=1> (referer: http://playnomore.co.kr/category/bag/24/)
2019-11-28 05:23:00 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET http://m.playnomore.co.kr/product/black-play-day-10-micro-baguette-grey-python-180/547/?cate_no=24&display_group=1> from <GET http://m.playnomore.co.kr/product/detail.html?product_no=547&cate_no=24&display_group=1>
2019-11-28 05:

2019-11-28 05:23:00 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/black-play-day-10-micro-baguette-green-python-180/546/?cate_no=24&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/201910/751de37eead5e1a82a373b0738ed1059.jpg',
 'link': 'http://playnomore.co.kr/product/black-play-day-10-micro-baguette-green-python-180/546/?cate_no=24&display_group=1',
 'price': '$ 162',
 'title': '[Black Play-Day 10%] MICRO BAGUETTE green python '}
2019-11-28 05:23:00 [scrapy.core.scraper] ERROR: Spider error processing <GET http://m.playnomore.co.kr/product/black-play-day-10-micro-baguette-grey-python-180/547/?cate_no=24&display_group=1> (referer: http://playnomore.co.kr/category/bag/24/)
Traceback (most recent call last):
  File "/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packag

In [None]:
import pandas as pd
# Load the CSV exported by the crawl and show its last row as a quick sanity check.
df = pd.read_csv("playnomore/playnomore.csv")
df.tail(1)

Unnamed: 0,img,link,price,title
13,http://playnomore.co.kr/web/product/big/201910...,http://playnomore.co.kr/product/black-play-day...,$ 162,[Black Play-Day 10%] MICRO BAGUETTE green python


#### 5. argument 설정

In [None]:
%%writefile playnomore/playnomore/spiders/spider.py
import scrapy
from playnomore.items import PlaynomoreItem

class PlaynomoreSpider(scrapy.Spider):
    """Crawl one playnomore.co.kr category and yield one item per product.

    The category is selected with spider arguments, e.g.
    ``scrapy crawl Playnomore -a category1=shoes -a category2=25``.
    """

    name = "Playnomore"
    custom_settings = {
        # Swap Scrapy's built-in user-agent middleware for the random one
        # provided by scrapy-fake-useragent.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }

    def __init__(self, category1="bag", category2=24, **kwargs):
        """Build the listing URL from the category name and category number.

        Args:
            category1: category slug used in the URL path (default "bag").
            category2: category number used in the URL path (default 24).
            **kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        self.start_url = "http://playnomore.co.kr/category/{}/{}/".format(category1, category2)
        super().__init__(**kwargs)

    def start_requests(self):
        """Yield the request for the selected category listing page."""
        url = self.start_url
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Yield a request for every product detail link on the listing page."""
        hrefs = response.xpath('//*[@id="contents"]/div[2]/div/ul/li/div/a/@href').extract()
        for href in hrefs:
            # hrefs may be relative; resolve them against the listing page URL.
            yield scrapy.Request(response.urljoin(href), callback=self.page_parse)

    def page_parse(self, response):
        """Yield a PlaynomoreItem built from a product detail page.

        Pages that lack the expected price or image node are logged and
        skipped, so a single page with unexpected markup does not raise an
        IndexError from the callback.
        """
        title1 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/font/text()').extract()
        title2 = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[1]/text()').extract()
        price = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[2]/div[2]/text()').extract_first()
        img_src = response.xpath('//*[@id="contents"]/div[1]/div[1]/div[1]/div[1]/img/@src').extract_first()

        if price is None or img_src is None:
            self.logger.warning("Skipping %s: expected product markup not found", response.url)
            return

        item = PlaynomoreItem()
        # The title is split between a <font> child and the div's own text.
        item["title"] = "".join(title1) + "".join(title2)
        item["price"] = price
        # The src attribute has no scheme, so "http:" is prepended.
        item["img"] = "http:" + img_src
        item["link"] = response.url
        yield item

Overwriting playnomore/playnomore/spiders/spider.py


In [None]:
%%writefile run.sh
#!/bin/sh
# Run the Playnomore spider for the shoes category (number 25) and export the items to CSV.
# Abort if the project directory is missing so scrapy never runs from the wrong place.
cd playnomore || exit 1
scrapy crawl Playnomore -o playnomore2.csv -a category1=shoes -a category2=25

Overwriting run.sh


In [None]:
!./run.sh

2019-11-28 05:32:14 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: playnomore)
2019-11-28 05:32:14 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.9 (default, Oct 24 2019, 05:23:48) - [GCC 7.4.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-1054-aws-x86_64-with-debian-buster-sid
2019-11-28 05:32:14 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'playnomore', 'FEED_FORMAT': 'csv', 'FEED_URI': 'playnomore2.csv', 'NEWSPIDER_MODULE': 'playnomore.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['playnomore.spiders']}
2019-11-28 05:32:14 [scrapy.extensions.telnet] INFO: Telnet Password: cdd410e38b7f619d
2019-11-28 05:32:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 's

2019-11-28 05:32:15 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-2cm-multi/140/?cate_no=25&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/201701/140_shop7_744641.jpg',
 'link': 'http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-2cm-multi/140/?cate_no=25&display_group=1',
 'price': '$ 414',
 'title': '[SOLD OUT] WINKYGIRL COLOR BLOCKS (2cm) multi'}
2019-11-28 05:32:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://playnomore.co.kr/product/sold-out-winkygirl-color-blocks-metallic-navy-5cm/20/?cate_no=25&display_group=1> (referer: http://playnomore.co.kr/category/shoes/25/)
2019-11-28 05:32:15 [scrapy.core.scraper] DEBUG: Scraped from <200 http://playnomore.co.kr/product/sold-out-shy-lip-bloafer-champagne-gold/305/?cate_no=25&display_group=1>
{'img': 'http://playnomore.co.kr/web/product/big/201706/305_shop7_443176.jpg',
 'link': 'http://playnomore.co.kr/product/sold-out-shy-lip-bloafe

In [None]:
import pandas as pd
# Load the CSV exported by the argument-driven crawl and show its last row as a quick check.
df = pd.read_csv("playnomore/playnomore2.csv")
df.tail(1)

Unnamed: 0,img,link,price,title
14,http://playnomore.co.kr/web/product/big/201702...,http://playnomore.co.kr/product/sold-out-winky...,$ 414,[SOLD OUT] WINKYGIRL COLOR BLOCKS (5cm) multi


#### 6. MongoDB에 저장
- pymongo를 pipelines.py에 적용
- pip install pymongo==2.8.1

In [None]:
!pip list | grep pymongo

pymongo               2.8.1    


In [None]:
import pymongo

In [None]:
# Create a client for the MongoDB server at this address and port.
client = pymongo.MongoClient('mongodb://13.125.153.195:27017/')
client

MongoClient('13.125.153.195', 27017)

In [None]:
# Select the `playnomore` database and its `shoes` collection.
db = client.playnomore
collection = db.shoes
collection

Collection(Database(MongoClient('13.125.153.195', 27017), 'playnomore'), 'shoes')

In [None]:
# Insert one sample document to check that writes reach the server
# (the title value is the Korean word for "shoes").
# `Collection.insert` is the insert call in the pymongo 2.8.1 release this
# notebook pins; later pymongo releases replace it with `insert_one`.
data = {"title":"신발"}
collection.insert(data)

ObjectId('5ddf5e60a54763659f6c9813')

##### Mongodb 모듈 파일 생성

In [None]:
%%writefile playnomore/playnomore/mongodb.py
import os

import pymongo

# Connection string for the MongoDB server. The default is the server used
# elsewhere in this notebook; set the MONGODB_URI environment variable to
# point the pipeline at a different server without editing this module.
MONGODB_URI = os.environ.get("MONGODB_URI", 'mongodb://13.125.153.195:27017/')

# Module-level handles shared by the item pipeline, which imports `collection`.
client = pymongo.MongoClient(MONGODB_URI)
db = client.playnomore
collection = db.shoes

Writing playnomore/playnomore/mongodb.py


In [None]:
%%writefile playnomore/playnomore/pipelines.py
from .mongodb import collection

class PlaynomorePipeline(object):
    """Item pipeline that writes every scraped item to the MongoDB collection."""

    def process_item(self, item, spider):
        """Copy the item's four fields into a plain dict, insert it, and pass the item on."""
        field_names = ("title", "price", "img", "link")
        document = {field: item[field] for field in field_names}
        collection.insert(document)
        return item

Overwriting playnomore/playnomore/pipelines.py


In [None]:
!echo "ITEM_PIPELINES = {"  >> playnomore/playnomore/settings.py

In [None]:
!echo "   'playnomore.pipelines.PlaynomorePipeline': 300," >> playnomore/playnomore/settings.py

In [None]:
!echo "}" >> playnomore/playnomore/settings.py

In [None]:
!tail -n 5 playnomore/playnomore/settings.py

#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
ITEM_PIPELINES = {
    playnomore.pipelines.PlaynomorePipeline: 300,
}


In [None]:
!cat run.sh

cd playnomore
scrapy crawl Playnomore -o playnomore2.csv -a category1=shoes -a category2=25


In [None]:
!./run.sh

Traceback (most recent call last):
  File "/home/ubuntu/.pyenv/versions/python3/bin/scrapy", line 8, in <module>
    sys.exit(execute())
  File "/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/cmdline.py", line 114, in execute
    settings = get_project_settings()
  File "/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/utils/project.py", line 69, in get_project_settings
    settings.setmodule(settings_module_path, priority='project')
  File "/home/ubuntu/.pyenv/versions/3.6.9/envs/python3/lib/python3.6/site-packages/scrapy/settings/__init__.py", line 294, in setmodule
    module = import_module(module)
  File "/home/ubuntu/.pyenv/versions/3.6.9/lib/python3.6/importlib/__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 994, in _gcd_import
  File "<frozen importlib._bootstrap>", line 971, in _find_and_load
  