In [None]:
'''
AI로 무엇을 하고 싶은가?
어떤 AI Engineer가 되고 싶은가?
우리는 대체 뭘해야 할까?


버전이나 개발환경이 중요하진 않음. 마음대로
github 추천

비정형 데이터 핸들링이 어려움
NoSQL로 데이터베이스 관리함

반정형 데이터: 일정한 구조를 갖지만 형식이 고정되어 있지 않음.
JSON, XML

데이터 인사이트가 중요함.
데이터 분석가: 데이터 기반 의사결정
AI 개발자: 모델 성능개선, 데이터 품질
모델만 깎는 것이 아니라 모델을 활용한 서비스를 제공하는 것에 초점
모델을 개발하고 깎을 거면 이론적인 지식이 더 필요함.

데이터 수집: 공공데이터, ai-hub
크롤링, 스크래핑: requests, beautifulsoup, scrapy, selenium
'''


# scrapy를 활용한 크롤링

In [None]:
# scrapy 실습
!pip install scrapy

Collecting scrapy
  Downloading Scrapy-2.11.2-py2.py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting Twisted>=18.9.0 (from scrapy)
  Downloading twisted-24.3.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting cssselect>=0.9.1 (from scrapy)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting itemloaders>=1.0.1 (from scrapy)
  Downloading itemloaders-1.3.1-py3-none-any.whl (12 kB)
Collecting parsel>=1.5.0 (from scrapy)
  Downloading parsel-1.9.1-py2.py3-none-any.whl (17 kB)
Collecting queuelib>=1.4.2 (from scrapy)
  Downloading queuelib-1.7.0-py2.py3-none-any.whl (13 kB)
Collecting service-identity>=18.1.0 (from scrapy)
  Downloading service_identity-24.1.0-py3-none-any.whl (12 kB)
Collecting w3lib>=1.17.0 (from scrapy)
  Downloading w3lib-2.2.1-py3-non

In [None]:
import os

# Name used for both the outer project folder and the inner Python package.
project_name = 'wiki_ko_al'

# Create <project_name>/<project_name>/spiders in one call; exist_ok=True makes
# re-running this cell a no-op when the folders are already there.
os.makedirs(os.path.join(project_name, project_name, 'spiders'), exist_ok=True)

In [None]:
# Contents of the Scrapy settings module for the project package.
# BOT_NAME / SPIDER_MODULES / NEWSPIDER_MODULE are derived from `project_name`
# so they point at the package directory this notebook actually creates; the
# previous hard-coded 'my_scrapy_project' named a package that is never created here.
settings_content = f"""
BOT_NAME = '{project_name}'

SPIDER_MODULES = ['{project_name}.spiders']
NEWSPIDER_MODULE = '{project_name}.spiders'

ROBOTSTXT_OBEY = True
"""
# Write settings.py inside the inner package directory. The explicit encoding
# keeps the written bytes independent of the platform's default locale.
with open(f"{project_name}/{project_name}/settings.py", "w", encoding="utf-8") as file:
    file.write(settings_content)

In [None]:
# Source code of a minimal Scrapy spider, held as a string so it can be written
# into the project's spiders/ directory below.
# The spider starts at the "humor" tag page of quotes.toscrape.com, yields one
# dict with 'text' and 'author' for every `div.quote` element, and, when an
# `li.next` link is present, follows it back into `parse` to walk the pagination.
spider_content = """
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
            }

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
"""
# Save the spider as <project_name>/<project_name>/spiders/quotes_spider.py;
# mode "w" replaces any previous copy of the file.
with open(f"{project_name}/{project_name}/spiders/quotes_spider.py", "w") as file:
    file.write(spider_content)

In [None]:
!scrapy runspider wiki_ko_al/wiki_ko_al/spiders/quotes_spider.py -o quotes.json

2024-07-02 03:47:41 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2024-07-02 03:47:41 [scrapy.utils.log] INFO: Versions: lxml 4.9.4.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.3.0, Python 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.2 4 Jun 2024), cryptography 42.0.8, Platform Linux-6.1.85+-x86_64-with-glibc2.35
2024-07-02 03:47:41 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)
2024-07-02 03:47:41 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-07-02 03:47:41 [scrapy.extensions.telnet] INFO: Telnet Password: 03db45b03d98100e
2024-07-02 03:47:41 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'sc

In [None]:
import json

# Read the whole exported feed and parse it into a list of item dicts.
with open("quotes.json") as f:
    quotes = json.loads(f.read())

# Show every scraped item, one dict per line.
for quote in quotes:
    print(quote)


{'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen'}
{'text': '“A day without sunshine is like, you know, night.”', 'author': 'Steve Martin'}
{'text': '“Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.”', 'author': 'Garrison Keillor'}
{'text': '“Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.”', 'author': 'Jim Henson'}
{'text': "“All you need is love. But a little chocolate now and then doesn't hurt.”", 'author': 'Charles M. Schulz'}
{'text': "“Remember, we're madly in love, so it's all right to kiss me anytime you feel like it.”", 'author': 'Suzanne Collins'}
{'text': '“Some people never go crazy. What truly horrible lives they must lead.”', 'author': 'Charles Bukowski'}
{'text': '“The trouble with having an open mind, of course, is that peop