# DHTML

In [None]:
from requests import request
from requests.exceptions import HTTPError
from time import sleep


def download(url, params=None, method="GET", retries=3):
    """Send an HTTP request and return the response, retrying on HTTP 500.

    Args:
        url: Address to request.
        params: Mapping (or pre-encoded string) sent as the query string
            when ``method`` is "GET" and as the form body when it is "POST".
            ``None`` means no parameters.
        method: HTTP method name. Only the exact strings "GET" and "POST"
            forward ``params``.
        retries: Number of additional attempts made when the server answers
            with status 500. Each retry waits 3 seconds first.

    Returns:
        The last response received. Responses with an error status are
        returned as well (after a message is printed), so callers should
        check ``status_code``.
    """
    # A fresh dict per call; a ``{}`` default would be shared across calls.
    if params is None:
        params = {}

    headers = {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'}
    resp = None

    while True:
        try:
            resp = request(method, url,
                           params=params if method == "GET" else {},
                           data=params if method == "POST" else {},
                           headers=headers)
            resp.raise_for_status()
            return resp
        except HTTPError as e:
            # Only a 500 is treated as transient; anything else is reported
            # and the error response is handed back unchanged.
            if e.response.status_code != 500:
                print("Request")
                print("Response")
                return resp

            if retries <= 0:
                print("재방문 횟수 초과")
                return resp

            sleep(3)
            retries -= 1

In [None]:
# Fetch the AJAX demo page with the download() helper defined above.
url = 'https://pythonscraping.com/pages/javascript/ajaxDemo.html'
resp = download(url)

In [None]:
# Show the raw body of the fetched page.
resp.text

In [None]:
# Inspect the Content-Type header of the response.
resp.headers['content-type']

In [None]:
from bs4 import BeautifulSoup
# Parse the response body into a DOM using the lxml parser.
dom = BeautifulSoup(resp.text, 'lxml')

In [None]:
# Text of the element with id "content", with surrounding whitespace stripped.
dom.select_one('#content').text.strip()

In [None]:
# List the <script> tags; per the original note, the jQuery AJAX call shows up here.
dom.select('script')

In [None]:
# Build the URL that the page's AJAX call requests and download it directly.

from requests.compat import urljoin

# Resolve the relative endpoint against the URL of the page fetched earlier.
newurl = urljoin(resp.request.url, 'loadedContent.php')
resp = download(newurl)
resp.headers['content-type']

In [None]:
# Per the original note: this is the content the page loads dynamically through AJAX (an XHR request).
resp.text 

In [None]:
# Example 2: call the Daum portal's search-suggestion API directly.
url = 'https://vsuggest.search.daum.net/v2/sushi/pc/get'
# 'q' carries the text to get suggestions for.
params = {'q' : '윤'}
resp = download(url, params)
resp.headers['content-type']

# Print the 'keyword' field of every entry under 'subkeys' in the JSON body.
for subkey in resp.json()['subkeys']:
    print(subkey['keyword'])

In [None]:
# Full request URL (as captured from the browser):
#"https://vsuggest.search.daum.net/v2/sushi/pc/get
#?callback=jQuery36008035563564235273_1695601874736
#&q=%EC%B9%B4%EB%A6%AC%E3%84%B4%E3%85%87%E3%84%B4%E3%85%87%E3%84%B4%E3%85%87%E3%84%B4%E3%85%87%E3%84%B4%E3%85%87%E3%84%B4%E3%85%87&htype=position&_=1695601874745"

In [None]:
# Repeat the suggestion request, this time sending the jQuery callback
# parameter seen in the browser's request.
params['callback'] = 'jQuery36008035563564235273_1695601874736'
resp = download(url, params)
# Look at the raw body: with a callback the server presumably answers with
# JSONP (callback(...)) rather than plain JSON — confirm from the output.
resp.text

In [None]:
# Example 2 (2): interactive loop that prints suggestions for whatever is typed.

while True:
    q = input()

    # Typing '종료' ends the loop.
    if q == '종료': 
        break

    # Reuse the url/params globals, replacing only the query text.
    params['q'] = q
    resp = download(url, params)
    # Print the suggested keywords on one comma-separated line.
    print(','.join([subkey['keyword']
                     for subkey in resp.json()['subkeys']]))

### 네이버

In [None]:

# Naver autocomplete request URL (as captured from the browser):
# https://ac.search.naver.com/nx/ac
# ?q=%EA%B8%B0
# &con=1&frm=nv&ans=2&r_format=json&r_enc=UTF-8&r_unicode=0&t_koreng=1&run=2&rev=4&q_enc=UTF-8&st=100&_callback=_jsonp_53

In [None]:
# Same request captured for a different query:
# https://ac.search.naver.com/nx/ac?q=%EA%B6%81&con=1&frm=nv&ans=2&r_format=json&r_enc=UTF-8&r_unicode=0&t_koreng=1&run=2&rev=4&q_enc=UTF-8&st=100&_callback=_jsonp_56

In [None]:
# Naver autocomplete API. The endpoint is kept free of a query string and
# every parameter goes through `params`, so 'q' is sent exactly once
# (previously the URL already contained a 'q' and a second one was appended).
url = 'https://ac.search.naver.com/nx/ac'
params = {'q': '윤',
          'con': '1',
          'frm': 'nv',
          'ans': '2',
          'r_format': 'json',
          'r_enc': 'UTF-8',
          'r_unicode': '0',
          't_koreng': '1',
          'run': '2',
          'rev': '4',
          'q_enc': 'UTF-8',
          'st': '100'}
# '_callback' is deliberately left out: with it the server presumably wraps
# the payload in a JSONP call, which resp.json() cannot parse.
resp = download(url, params)
resp.headers['content-type']

# NOTE(review): 'subkeys' is the Daum schema. Naver's payload is assumed to
# carry suggestions under 'items' as nested lists ([[["word"], ...]]) —
# confirm against a live response.
for group in resp.json().get('items', []):
    for entry in group:
        print(entry[0])

In [None]:
# Try the same approach on Brunch: call its article-search API.
url = 'https://api.brunch.co.kr/v1/search/article'
# Query parameters sent with the search request.
params = {'q' : '미시간',
          'page' : '1',
          'pagesize' : '20',
          'highlighter' : 'n',
          'escape' : 'y',
          'sortBy' : 'accu'}

resp = download(url, params)
resp.headers['content-type']

In [None]:
# Print the title of every entry under data -> list in the JSON body.
for item in resp.json()['data']['list']:
    print(item['title'])

### 네이버 웹툰 

In [None]:
# Fetch the Naver Webtoon front page and parse it with lxml.
url = 'https://comic.naver.com'
resp = download(url)
dom = BeautifulSoup(resp.text, 'lxml' )

In [None]:
# Call the home-component API directly (type and order are given in the query string).
url = 'https://comic.naver.com/api/home/component?type=DAILY_WEBTOON&order=STAR'
resp = download(url)
resp.headers['Content-Type'] # json


In [None]:
# Print the name of every entry under 'titleList'. The loop variable is
# named `title` so the builtin `list` is not overwritten for later cells.
for title in resp.json()['titleList']:
    print(title['titleName'])

### 웹툰의 회차 목록

In [None]:
# Request the article-list info API for a single title id and show the response headers.
url = 'https://comic.naver.com/api/article/list/info?titleId=648419'
resp = download(url)
resp.headers

In [None]:
# First entry under 'titleList' in the JSON body.
# NOTE(review): the crawler later in this file looks for 'articleList' in
# article-list responses — confirm 'titleList' is the right key here.
resp.json()['titleList'][0]

### 3. 각 회차

In [None]:
# Fetch one episode's detail page and list every <img> tag inside <body>.
url = 'https://comic.naver.com/webtoon/detail?titleId=648419&no=403&week=mon'
resp = download(url)
dom = BeautifulSoup(resp.text, 'lxml')
dom.body.select('img')

### 크롤러 사용해서 이미지 가져오기

In [None]:
import re
from requests.compat import urljoin, urlparse, urlencode

In [None]:
# Queue-driven crawler. Each queue entry is a tuple that is unpacked into
# download(): (url,) or (url, query_string).
#   JSON with 'titleList'   -> enqueue each title's article-list API URL
#   JSON with 'articleList' -> enqueue the first listed episode's detail page
#   HTML                    -> enqueue every image under #sectionContWide
#   image                   -> save the bytes under ./webtoon
import os

url = 'https://comic.naver.com/api/home/component'
URLs = []
# Seed with an explicit query string (the one used for this API earlier in
# the file) instead of whatever `params` another cell happened to leave behind.
URLs.append((url, 'type=DAILY_WEBTOON&order=STAR'))
seens = []
domain = []  # not used by the crawl; kept because it is a notebook global

# Make sure the output folder exists before the first image is written.
os.makedirs('./webtoon', exist_ok=True)

while URLs:
    seed = URLs.pop(0)

    resp = download(*seed)  # unpack (url[, query_string]) into download()
    seens.append(resp.request.url)

    if resp.status_code != 200:
        continue

    # A missing header is treated as "no known type" instead of raising.
    content_type = resp.headers.get('content-type', '')

    # Episode page: collect the image URLs.
    if re.search('text/html', content_type):
        dom = BeautifulSoup(resp.text, 'html5lib')
        for link in dom.select('#sectionContWide img[src]'):
            href = link.attrs['src']
            newurl = urljoin(resp.request.url, href)

            # Split only on the first '?' so the tuple never grows a third
            # element (which download() would take as the HTTP method).
            urlc = tuple(newurl.split('?', 1))

            if newurl not in seens and urlc not in URLs:
                URLs.append(urlc)

    # Image: save it to disk.
    elif re.search('image/(?:(?:jpeg)|(?:gif)|(?:png))', content_type):
        fname = re.sub('[?]', '', resp.request.url.split('/')[-1])
        with open(f'./webtoon/{fname}', 'wb') as fp:
            fp.write(resp.content)

    # API response: derive the next level of URLs.
    elif re.search('application/json', content_type):
        result = resp.json()

        if 'titleList' in result:
            baseurl = 'https://comic.naver.com/api/article/list/info?titleId='
            # str() because the id may arrive as a number in the JSON.
            newurls = [baseurl + str(r['titleId']) for r in result['titleList']]
        elif 'articleList' in result:
            baseurl = 'https://comic.naver.com/webtoon/detail?'
            # seed[-1] is the query string of the request that produced this
            # list; only the first listed episode is followed.
            newurls = [baseurl + seed[-1] + '&no=' + str(r['no'])
                       for r in result['articleList']][:1]
        else:
            newurls = []

        for newurl in newurls:
            urlc = tuple(newurl.split('?', 1))
            if newurl not in seens and urlc not in URLs:
                URLs.append(urlc)

In [None]:
import os
# Create the image folder; exist_ok lets this cell be re-run without
# raising FileExistsError once the folder is there.
os.makedirs('./webtoon', exist_ok=True)
os.listdir('.')

In [None]:
# Number of entries currently in the URLs queue.
len(URLs)

## Selenium

## Cookies


### 로그인 하기(?)

In [None]:
# Fetch the login demo page and parse it with lxml.
url = 'https://pythonscraping.com/pages/cookies/login.html'
resp = download(url)
dom = BeautifulSoup(resp.text, 'lxml')

In [None]:
# The login lives in a <form> element; show the first one on the page.

dom.select_one('form')

In [None]:
# Attributes of the form tag.
dom.select_one('form').attrs

In [None]:
# Print the attributes of every named <input> that is a direct child of the form.
for tag in dom.select('form > input[name]'):
    print(tag.attrs)

In [None]:
# Resolve the form's 'action' attribute against the URL of the page it came from.
urljoin(resp.request.url, dom.select_one('form').attrs['action'])

In [None]:
# One empty "name=" pair per named <input> directly under the form,
# then preview them joined the way a query string would be.
params = [f"{tag.attrs['name']}=" for tag in dom.select('form > input[name]')]
'&'.join(params)

In [None]:
# HTTP method declared on the form tag.
dom.select_one('form').attrs['method']

In [None]:
# Submit the login form to its action URL using the method the form itself
# declares. download() only recognises the upper-case names "GET"/"POST",
# and it defaults to GET, so the method is passed explicitly and upper-cased.
newurl = urljoin(resp.request.url, dom.select_one('form').attrs['action'])
resp = download(newurl,
                {'username': '아무거나', 'password': 'password'},
                method=dom.select_one('form').attrs.get('method', 'GET').upper())

In [None]:
# 1. The LMS before logging in with cookies.

from requests import get
resp = get('https://lms.sunde41.net')
# Headers that were sent with this request.
resp.request.headers

In [None]:
# 2. The LMS after logging in with cookies: same request, now sending cookies.
# NOTE(review): `sess` is not defined anywhere in the visible file — presumably
# a requests session that has already logged in; confirm it exists before running.
resp = get('https://lms.sunde41.net', cookies = sess.cookies)
resp.text

## 실습

1. LMS 쿠키로 로그인
2. 수업게시판에 첨부자료가 있는 수업 목록만 추출하여 첨부자료 링크를 추출하기

In [None]:
from requests import get
from bs4 import BeautifulSoup

# Course page on the LMS to scrape.
url = 'https://lms.sunde41.net/course/5'

# Cookies sent with the request so the LMS sees a logged-in session.
# NOTE(review): these look like real session credentials committed in source —
# rotate them and load them from the environment or a secrets store instead.
c = {'remember_token' : '1996-07-03|303bdc8ac4d8331554acba93a67dd258b2847e29231bb5daf7c9dd18b45a980a3ca5fc64707c946f7a67d8d53d37e8eaf72cf46e3f52608eb55d139afe0fea89',
     'session' : 'eyJfZnJlc2giOmZhbHNlLCJfdXNlcl9pZCI6IjE5OTYtMDctMDMifQ.ZRDP0Q.h38wXivQBj0McYBsFnategetzEg'}

resp = get(url, cookies= c)
dom = BeautifulSoup(resp.text, 'lxml')

In [None]:
import os 
# List the entries of the current working directory.
os.listdir()

In [33]:
# Print the absolute URL of every link inside the course navigation items.
# The loop variable is named `anchor` so the builtin `list` stays usable.
base = 'https://lms.sunde41.net/'
for anchor in dom.select('.m-nav__item.course a'):
    print(base + anchor['href'])

https://lms.sunde41.net//static/uploads/lectures/5/Database.pdf
https://lms.sunde41.net//static/uploads/lectures/5/%E1%84%86%E1%85%A9%E1%84%83%E1%85%B3%E1%86%AB%20%E1%84%8F%E1%85%A5%E1%86%B7%E1%84%91%E1%85%B2%E1%84%90%E1%85%A5%20%E1%84%80%E1%85%AA%E1%84%92%E1%85%A1%E1%86%A8%E1%84%8C%E1%85%A1%E1%84%80%E1%85%A1%20%E1%84%8B%E1%85%A1%E1%86%AF%E1%84%8B%E1%85%A1%E1%84%8B%E1%85%A3%20%E1%84%92%E1%85%A1%E1%86%AF%20%E1%84%87%E1%85%AE%E1%84%83%E1%85%A9%E1%86%BC%20%E1%84%89%E1%85%A9%E1%84%89%E1%85%AE%E1%84%8C%E1%85%A5%E1%86%B7%E1%84%8B%E1%85%B4%20%E1%84%86%E1%85%A9%E1%84%83%E1%85%B3%E1%86%AB%E1%84%80%E1%85%A5%E1%86%BA.pdf
https://lms.sunde41.net//static/uploads/lectures/5/SQLite.pdf
https://lms.sunde41.net//static/uploads/lectures/5/0905.html
https://lms.sunde41.net//static/uploads/lectures/5/ORM.pdf
https://lms.sunde41.net//static/uploads/lectures/5/0906.html
https://lms.sunde41.net//static/uploads/lectures/5/Functional%20Programming%20in%20Python.pdf
https://lms.sunde41.net//static/uploads/lectu

In [32]:
import os 
# Create the download folder; exist_ok lets this cell be re-run without
# raising FileExistsError once the folder is there.
os.makedirs('./download', exist_ok=True)
os.listdir()

['0905.db',
 '0905.db-journal',
 'db_0905.ipynb',
 'db_0906.ipynb',
 'db_0907.ipynb',
 'db_0908.ipynb',
 'db_0911.ipynb',
 'db_0913.ipynb',
 'db_0915.ipynb',
 'db_0918.ipynb',
 'db_0925.ipynb',
 'download',
 'img6.jpg',
 'sns.db',
 'teacher',
 'webtoon']

In [34]:
# Collect (and print) the absolute URL of every link inside the course
# navigation items. The loop variable is named `anchor` so the builtin
# `list` stays usable in later cells.
urls = []
base = 'https://lms.sunde41.net/'
for anchor in dom.select('.m-nav__item.course a'):
    link = base + anchor['href']
    urls.append(link)
    print(link)

https://lms.sunde41.net//static/uploads/lectures/5/Database.pdf
https://lms.sunde41.net//static/uploads/lectures/5/%E1%84%86%E1%85%A9%E1%84%83%E1%85%B3%E1%86%AB%20%E1%84%8F%E1%85%A5%E1%86%B7%E1%84%91%E1%85%B2%E1%84%90%E1%85%A5%20%E1%84%80%E1%85%AA%E1%84%92%E1%85%A1%E1%86%A8%E1%84%8C%E1%85%A1%E1%84%80%E1%85%A1%20%E1%84%8B%E1%85%A1%E1%86%AF%E1%84%8B%E1%85%A1%E1%84%8B%E1%85%A3%20%E1%84%92%E1%85%A1%E1%86%AF%20%E1%84%87%E1%85%AE%E1%84%83%E1%85%A9%E1%86%BC%20%E1%84%89%E1%85%A9%E1%84%89%E1%85%AE%E1%84%8C%E1%85%A5%E1%86%B7%E1%84%8B%E1%85%B4%20%E1%84%86%E1%85%A9%E1%84%83%E1%85%B3%E1%86%AB%E1%84%80%E1%85%A5%E1%86%BA.pdf
https://lms.sunde41.net//static/uploads/lectures/5/SQLite.pdf
https://lms.sunde41.net//static/uploads/lectures/5/0905.html
https://lms.sunde41.net//static/uploads/lectures/5/ORM.pdf
https://lms.sunde41.net//static/uploads/lectures/5/0906.html
https://lms.sunde41.net//static/uploads/lectures/5/Functional%20Programming%20in%20Python.pdf
https://lms.sunde41.net//static/uploads/lectu

In [36]:
import requests

# Download every collected URL into ./download as 1.<ext>, 2.<ext>, ...
# `counter` numbers the files and follows the position of the URL in `urls`.
os.makedirs('download', exist_ok=True)

for counter, url in enumerate(urls, start=1):
    # File extension taken from the URL (e.g. .pdf, .html).
    file_extension = os.path.splitext(url)[-1]
    filename = os.path.join('download', f"{counter}{file_extension}")

    # The context manager releases the streamed connection even when the
    # body is not read to the end or writing fails.
    with requests.get(url, stream=True) as response:
        # Do not save an error page under the attachment's name.
        if not response.ok:
            print(f"Failed to download {url}: HTTP {response.status_code}")
            continue

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    print(f"Downloaded {url} to {filename}")

Downloaded https://lms.sunde41.net//static/uploads/lectures/5/Database.pdf to download\1.pdf
Downloaded https://lms.sunde41.net//static/uploads/lectures/5/%E1%84%86%E1%85%A9%E1%84%83%E1%85%B3%E1%86%AB%20%E1%84%8F%E1%85%A5%E1%86%B7%E1%84%91%E1%85%B2%E1%84%90%E1%85%A5%20%E1%84%80%E1%85%AA%E1%84%92%E1%85%A1%E1%86%A8%E1%84%8C%E1%85%A1%E1%84%80%E1%85%A1%20%E1%84%8B%E1%85%A1%E1%86%AF%E1%84%8B%E1%85%A1%E1%84%8B%E1%85%A3%20%E1%84%92%E1%85%A1%E1%86%AF%20%E1%84%87%E1%85%AE%E1%84%83%E1%85%A9%E1%86%BC%20%E1%84%89%E1%85%A9%E1%84%89%E1%85%AE%E1%84%8C%E1%85%A5%E1%86%B7%E1%84%8B%E1%85%B4%20%E1%84%86%E1%85%A9%E1%84%83%E1%85%B3%E1%86%AB%E1%84%80%E1%85%A5%E1%86%BA.pdf to download\2.pdf
Downloaded https://lms.sunde41.net//static/uploads/lectures/5/SQLite.pdf to download\3.pdf
Downloaded https://lms.sunde41.net//static/uploads/lectures/5/0905.html to download\4.html
Downloaded https://lms.sunde41.net//static/uploads/lectures/5/ORM.pdf to download\5.pdf
Downloaded https://lms.sunde41.net//static/uploads/lec