In [None]:
# web crawling

In [None]:
!python -m pip install requests
!python -m pip install beautifulsoup4

In [3]:
# robots.txt: download a site's crawling policy and print it

import requests

url = "https://www.naver.com/robots.txt"
# requests has no default timeout; without one the call can block forever
# if the server never responds.
response = requests.get(url, timeout=10)
print(response.text)

User-agent: *
Disallow: /
Allow : /$
Allow : /.well-known/privacy-sandbox-attestations.json


In [4]:
# 1. urllib

import urllib.request

url = "https://google.com"
# The with-block closes the HTTP response (and its connection) on exit;
# the timeout keeps the cell from hanging on an unresponsive server.
with urllib.request.urlopen(url, timeout=10) as response:
    html = response.read()
print(html)  # read() returns raw bytes, so this prints a b'...' literal

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="bGz6enODp4caoic0OokpfA">(function(){var _g={kEI:\'ZR8PaOLuBJDL0-kP2L2XuA4\',kEXPI:\'0,202792,3497459,698,435,623871,9708,344796,247319,42725,5241681,108,85,32768741,4043709,25228681,11556,100650,26062,14117,22911,34213,8038,6756,23879,9138,3079,1521,328,6225,2947,61218,15048,8213,7422,30376,28339,48312,1789,4105,353,2265,16615,8967,4617,5774,4309,9172,3170,4992,5968,4719,2545,7625,1635,2801,453,8,458,2531,35,7177,1911,2278,654,4883,570,8788,2396,354,658,609,3963,453,99,3506,7084,4602,836,2,622,1148,1715,1762,1157,1565,860,997,9,1,3,440,4129,1130,1596,137,704,5,3439,131,2,1,2,2,2,3,2965,1419,181,2578,13,3,1521,1249,822,943,243,2,65,334,601,3,695,775,1241,238,1131,2362,635,5,1162,981,931,66,458,17,596,623,1

In [10]:
# Open a web page with urlopen()

from urllib.request import urlopen

url = "https://www.example.com"
# Close the response deterministically and bound the wait with a timeout.
with urlopen(url, timeout=10) as response:
    html = response.read().decode('utf-8')
print(html[:300])  # show only the first 300 characters

<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background


In [9]:
# Add a User-Agent header by wrapping the URL in a Request object

from urllib.request import Request, urlopen

url = "https://www.example.com"
headers = {"User-Agent": "Mozilla/5.0"}
req = Request(url, headers=headers)
# Close the response deterministically and bound the wait with a timeout.
with urlopen(req, timeout=10) as response:
    html = response.read().decode('utf-8')
print(html[:300])  # show only the first 300 characters

<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background


In [11]:
# Analyze a URL with urllib.parse

from urllib.parse import parse_qs, urlparse

url = "https://www.example.com/search?q=python&sort=latest"

# urlparse splits the URL into a ParseResult
# (scheme, netloc, path, params, query, fragment).
parsed = urlparse(url)

# parse_qs decodes the query string into a dict that maps each key
# to a list of values: {'q': ['python'], 'sort': ['latest']}
params = parse_qs(parsed.query)

# One print call with a newline separator emits the same two lines.
print(parsed, params, sep="\n")

ParseResult(scheme='https', netloc='www.example.com', path='/search', params='', query='q=python&sort=latest', fragment='')
{'q': ['python'], 'sort': ['latest']}


In [12]:
# Build URL query parameters with urllib.parse.urlencode()

from urllib.parse import urlencode

# Keyword order is preserved, so the pairs are encoded as q, lang, page.
params = dict(q='chatgpt', lang='ko', page=1)

# urlencode turns the mapping into "key=value" pairs joined by "&".
query_string = urlencode(params)
url = f"https://www.example.com/search?{query_string}"
print(url)

https://www.example.com/search?q=chatgpt&lang=ko&page=1


In [None]:
# Image file download example

from pathlib import Path
from urllib.request import urlretrieve

image_url = "https://picsum.photos/500/300"
save_path = "image/downloaded_image.png"
# NOTE(review): the service may return JPEG data, in which case the .png
# extension would not match the file contents — confirm.

# urlretrieve does not create missing directories; without this step the
# download fails with FileNotFoundError when "image/" does not exist yet.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

urlretrieve(image_url, save_path)
print("이미지 다운로드 완료")

이미지 다운로드 완료!


In [17]:
# Consult robots.txt before crawling (is fetching a given URL allowed?)
from urllib.robotparser import RobotFileParser

# Handing the robots.txt URL to the constructor registers it the same way
# a separate set_url() call would.
rp = RobotFileParser("https://www.example.com/robots.txt")
rp.read()  # download and parse the robots.txt file

# "*" asks about the rules that apply to any user agent.
can_fetch = rp.can_fetch(useragent="*", url="https://www.example.com/somepage")
print("접근 가능 여부:", can_fetch)

접근 가능 여부: True


In [None]:
# 5. requests (외부 라이브러리)

In [None]:
# 1. Basic request - GET

import requests
url = "https://example.com"
# requests has no default timeout; set one so the cell cannot hang forever.
response = requests.get(url, timeout=10)
html = response.text  # response body decoded to str
print(html)

In [19]:
import requests
# Import the requests library (used to send web requests)

url = 'https://www.python.org/'
# The URL to connect to

response = requests.get(url, timeout=10)
# Send a GET request to the URL and keep the response object.
# The timeout prevents the call from blocking forever on an unresponsive server.

print("상태 코드:", response.status_code)
# Print the HTTP status code of the response
# 200 means success, 404 page not found, 403 access forbidden, and so on

print("본문 일부:\n", response.text[:300])
# Print only the first 300 characters of the body (the full text can be very long)

상태 코드: 200
본문 일부:
 <!doctype html>
<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->
<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->
<!--[if IE 8]>      <html class="no-js ie8 lt-ie9">                 <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js"


In [20]:
# Send a POST request

import requests
# NOTE(review): these look like placeholder demo values posted to the httpbin
# echo endpoint; real credentials must never be hardcoded in a notebook.
data = {'id': 'bc_kim', 'pw': '1234'}
# data= sends the dict as a form-encoded body; the timeout bounds the wait.
response = requests.post('https://httpbin.org/post', data=data, timeout=10)
print(response.json())

{'args': {}, 'data': '', 'files': {}, 'form': {'id': 'bc_kim', 'pw': '1234'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Content-Length': '17', 'Content-Type': 'application/x-www-form-urlencoded', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.32.3', 'X-Amzn-Trace-Id': 'Root=1-680f2db5-5533cf8106e98ae956905722'}, 'json': None, 'origin': '115.95.149.11', 'url': 'https://httpbin.org/post'}


In [21]:
# requests.Request: build, prepare and send a request explicitly
# NOTE: this import rebinds the name `Request`, which an earlier cell imported
# from urllib.request; re-run that cell before using the urllib version again.
from requests import Request, Session

# The with-block closes the session's pooled connections when done.
with Session() as s:
    req = Request('GET', 'https://httpbin.org/get')
    # prepare_request merges session-level settings into the request.
    prepped = s.prepare_request(req)
    # send() has no default timeout, so set one explicitly.
    resp = s.send(prepped, timeout=10)
print(resp.text)

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.32.3", 
    "X-Amzn-Trace-Id": "Root=1-680f2e80-5010fdf0313bdf763d2e0113"
  }, 
  "origin": "115.95.149.11", 
  "url": "https://httpbin.org/get"
}



In [22]:
# 4. requests.Session: share headers across requests
import requests  # imported here so the cell does not depend on earlier cells

# The with-block closes the session's pooled connections when done.
with requests.Session() as s:
    # Headers set on the session are sent with every request made through it.
    s.headers.update({'User-Agent': 'my-app/0.0.1'})
    response = s.get('https://httpbin.org/headers', timeout=10)
print(response.json())

{'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'my-app/0.0.1', 'X-Amzn-Trace-Id': 'Root=1-680f2e96-65e421803e1282cf56a63c60'}}
