In [2]:
import requests
import bs4

# Record the installed library versions (one per line) so the run can be reproduced.
print(requests.__version__, bs4.__version__, sep='\n')

2.31.0
4.12.3


## 네이버 상태 코드

In [3]:
URL = 'https://www.naver.com/'

# A timeout keeps the cell from hanging forever if the server never answers;
# requests waits indefinitely when no timeout is given.
req = requests.get(URL, timeout=10)
print(req.status_code)

200


In [4]:
# req.text

In [5]:
from bs4 import BeautifulSoup
import pandas as pd

# step 01: collect the data (read the raw HTML; the file is only needed for this read)
with open('index.html', 'r', encoding='UTF8') as html_file:
    contents = html_file.read()

# step 02: parse the data (turn the plain HTML string into a BeautifulSoup object
# so that BeautifulSoup syntax can be used for crawling)
soup = BeautifulSoup(contents, 'lxml')

# step 03: find the tags that hold the data -- the goal is to collect the
# company name from each of the four <li> tags
li_tags = soup.find_all('li')
print(li_tags)

# step 04: process the data (keep only the text of each tag)
companies = [li_tag.text for li_tag in li_tags]
print(companies)

# step 05: store the processed data in a pandas DataFrame
crawling_dict = {'회사명': companies}
result = pd.DataFrame(crawling_dict)
print(result)

# step 06: export to a csv file (or to a DB)
result.to_csv("result.csv", index=False)

[<li>애플</li>, <li>삼성</li>, <li>노키아</li>, <li>LG</li>]
['애플', '삼성', '노키아', 'LG']
   회사명
0   애플
1   삼성
2  노키아
3   LG


## 벅스 뮤직

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 

def crawling(soup) :
    """Extract the song titles from a parsed chart page.

    Args:
        soup: Parsed page exposing ``find`` (a BeautifulSoup object in this
            notebook).

    Returns:
        list[str]: The whitespace-stripped text of every ``<p class="title">``
        found inside the first ``<tbody>``, in document order. An empty list
        is returned when the page has no ``<tbody>``.
    """
    tbody = soup.find("tbody")
    if tbody is None:
        # find() returns None when the tag is absent; without this guard the
        # lookup below would fail with AttributeError on None.
        return []
    return [p.get_text().strip() for p in tbody.find_all('p', class_='title')]

def main() :
    """Download the Bugs music chart page and print its song titles.

    The page is requested with browser-like headers, parsed with
    BeautifulSoup, and the titles returned by ``crawling`` are printed as a
    one-column DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    custom_header = {
        'referer' : 'https://music.bugs.co.kr/',
        'user-agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    url = "https://music.bugs.co.kr/chart" # website to crawl
    # The timeout prevents an unanswered request from blocking the cell forever.
    req = requests.get(url, headers = custom_header, timeout = 10)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    req.raise_for_status()

    soup = BeautifulSoup(req.text, "html.parser")

    # Extract the titles exactly once and reuse the result.
    titles = crawling(soup)
    print(pd.DataFrame({"노래제목" : titles}))


if __name__ == "__main__" :
    main()

                              노래제목
0                    Love wins all
1                             Wife
2                         Love 119
3                            To. X
4                             에피소드
..                             ...
95                         사랑을 하다가
96  Smoke (Prod. Dynamicduo, Padi)
97                        미워 (Ego)
98                           밤, 바다
99            사랑은 먼 길을 돌아온 메아리 같아서

[100 rows x 1 columns]


## 네이버 주식

In [40]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import time
import random

# Hide FutureWarning messages only; every other warning category stays visible.
warnings.simplefilter(action='ignore', category=FutureWarning)

company_code = '005930' # Samsung Electronics
url ="https://finance.naver.com/item/sise_day.nhn?code=" + company_code

headers = {
    'referer' : f'https://finance.naver.com/item/sise.naver?code={company_code}',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

N = 2  # number of pages to download

# Collect one table per page and concatenate once after the loop.
page_frames = []
for pageNum in range(1, N + 1):
    # The page number has to be part of the requested URL; otherwise every
    # iteration would download the first page again.
    page_url = f'{url}&page={pageNum}'
    req = requests.get(page_url, headers=headers, timeout=10)
    # The first table on the page is the one kept.
    result = pd.read_html(req.text, encoding='euc-kr')[0]
    page_frames.append(result)

df = pd.concat(page_frames, ignore_index=True)


df

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
0,,,,,,,
1,2024.01.31,72700.0,1600.0,73400.0,74000.0,72500.0,13080752.0
2,2024.01.30,74300.0,100.0,75000.0,75300.0,73700.0,12244418.0
3,2024.01.29,74400.0,1000.0,73800.0,75200.0,73500.0,13976521.0
4,2024.01.26,73400.0,700.0,73700.0,74500.0,73300.0,11160062.0
5,2024.01.25,74100.0,100.0,74200.0,74800.0,73700.0,11737747.0
6,,,,,,,
7,,,,,,,
8,,,,,,,
9,2024.01.24,74000.0,1200.0,75200.0,75200.0,73500.0,12860661.0


In [41]:
# Keep only the rows without any missing value; the original row labels are preserved.
df = df.dropna()
df

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2024.01.31,72700.0,1600.0,73400.0,74000.0,72500.0,13080752.0
2,2024.01.30,74300.0,100.0,75000.0,75300.0,73700.0,12244418.0
3,2024.01.29,74400.0,1000.0,73800.0,75200.0,73500.0,13976521.0
4,2024.01.26,73400.0,700.0,73700.0,74500.0,73300.0,11160062.0
5,2024.01.25,74100.0,100.0,74200.0,74800.0,73700.0,11737747.0
9,2024.01.24,74000.0,1200.0,75200.0,75200.0,73500.0,12860661.0
10,2024.01.23,75200.0,100.0,75700.0,75800.0,74300.0,14786224.0
11,2024.01.22,75100.0,400.0,75900.0,76000.0,75000.0,19673375.0
12,2024.01.19,74700.0,3000.0,73500.0,74700.0,73000.0,23363427.0
13,2024.01.18,71700.0,700.0,71600.0,72000.0,70700.0,17853397.0


### 최종 완성본

In [42]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import time
import random
# Hide FutureWarning messages only, so that unrelated warnings remain visible.
warnings.simplefilter(action='ignore', category=FutureWarning)

def crawling(url, headers, soup, max_pages=12):
    """Download the paginated daily price tables and merge them into one frame.

    Args:
        url: Base URL of the listing; ``&page=<n>`` is appended for each page.
        headers: HTTP headers sent with every page request.
        soup: Parsed first page. Its ``td.pgRR`` cell is read to find the
            number of the last page (the value after the final ``=`` in the
            link's ``href``).
        max_pages: Upper bound on the number of pages downloaded. The default
            of 12 matches the number of pages the notebook fetched before this
            parameter existed. Pass ``None`` to download every page.

    Returns:
        pandas.DataFrame: The first table of every downloaded page, stacked in
        page order, with rows containing missing values removed and the index
        renumbered from 0.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    last_cell = soup.select_one('td.pgRR')
    if last_cell is not None and last_cell.a is not None:
        last_page = int(last_cell.a['href'].split('=')[-1])
    else:
        # No "last page" link was found: treat the listing as a single page.
        last_page = 1

    page_count = last_page if max_pages is None else min(last_page, max_pages)

    frames = []
    for page in range(1, page_count + 1):
        if page > 1:
            # Random pause between consecutive requests (never after the last one).
            time.sleep(random.uniform(2, 4))
        req = requests.get(f'{url}&page={page}', headers=headers, timeout=10)
        req.raise_for_status()
        frames.append(pd.read_html(req.text, encoding="euc-kr")[0])

    # Concatenate once instead of growing a frame inside the loop.
    merged = pd.concat(frames, ignore_index=True)
    return merged.dropna().reset_index(drop=True)

def main(company_code='005930'):
    """Download and print the daily price table of one stock from Naver Finance.

    Args:
        company_code: Stock code placed in the request URL and in the referer
            header. Defaults to '005930' (Samsung Electronics).

    Raises:
        requests.HTTPError: If the first page request returns an error status.
    """
    url ="https://finance.naver.com/item/sise_day.nhn?code=" + company_code

    headers = {
        'referer' : f'https://finance.naver.com/item/sise.naver?code={company_code}',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    # The first page is fetched here so that crawling() can read the
    # pagination information from its parsed HTML.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    result = crawling(url, headers, soup)
    print(result)

if __name__ == "__main__":
    main()

             날짜       종가     전일비       시가       고가       저가         거래량
0    2024.01.31  72700.0  1600.0  73400.0  74000.0  72500.0  13080752.0
1    2024.01.30  74300.0   100.0  75000.0  75300.0  73700.0  12244418.0
2    2024.01.29  74400.0  1000.0  73800.0  75200.0  73500.0  13976521.0
3    2024.01.26  73400.0   700.0  73700.0  74500.0  73300.0  11160062.0
4    2024.01.25  74100.0   100.0  74200.0  74800.0  73700.0  11737747.0
..          ...      ...     ...      ...      ...      ...         ...
115  2023.08.10  68000.0   900.0  68300.0  68500.0  67800.0  10227311.0
116  2023.08.09  68900.0  1300.0  68000.0  69600.0  67900.0  17259673.0
117  2023.08.08  67600.0   900.0  69000.0  69100.0  67400.0  14664709.0
118  2023.08.07  68500.0   200.0  67700.0  69200.0  67600.0  10968505.0
119  2023.08.04  68300.0   500.0  68800.0  69100.0  68200.0  12360193.0

[120 rows x 7 columns]


In [5]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings
import time
import random
# Hide FutureWarning messages only, so that unrelated warnings remain visible.
warnings.simplefilter(action='ignore', category=FutureWarning)

def crawling(url, headers, soup, max_pages=12):
    """Download the paginated daily price tables and merge them into one frame.

    Args:
        url: Base URL of the listing; ``&page=<n>`` is appended for each page.
        headers: HTTP headers sent with every page request.
        soup: Parsed first page. Its ``td.pgRR`` cell is read to find the
            number of the last page (the value after the final ``=`` in the
            link's ``href``).
        max_pages: Upper bound on the number of pages downloaded. The default
            of 12 matches the number of pages the notebook fetched before this
            parameter existed. Pass ``None`` to download every page.

    Returns:
        pandas.DataFrame: The first table of every downloaded page, stacked in
        page order, with rows containing missing values removed and the index
        renumbered from 0.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    last_cell = soup.select_one('td.pgRR')
    if last_cell is not None and last_cell.a is not None:
        last_page = int(last_cell.a['href'].split('=')[-1])
    else:
        # No "last page" link was found: treat the listing as a single page.
        last_page = 1

    page_count = last_page if max_pages is None else min(last_page, max_pages)

    frames = []
    for page in range(1, page_count + 1):
        if page > 1:
            # Random pause between consecutive requests (never after the last one).
            time.sleep(random.uniform(2, 4))
        req = requests.get(f'{url}&page={page}', headers=headers, timeout=10)
        req.raise_for_status()
        frames.append(pd.read_html(req.text, encoding="euc-kr")[0])

    # Concatenate once instead of growing a frame inside the loop.
    merged = pd.concat(frames, ignore_index=True)
    return merged.dropna().reset_index(drop=True)

def main(company_code='000270'):
    """Download and print the daily price table of one stock from Naver Finance.

    Args:
        company_code: Stock code placed in the request URL and in the referer
            header. Defaults to '000270' (Kia).

    Raises:
        requests.HTTPError: If the first page request returns an error status.
    """
    url ="https://finance.naver.com/item/sise_day.nhn?code=" + company_code

    headers = {
        'referer' : f'https://finance.naver.com/item/sise.naver?code={company_code}',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    # The first page is fetched here so that crawling() can read the
    # pagination information from its parsed HTML.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    result = crawling(url, headers, soup)
    print(result)

if __name__ == "__main__":
    main()

             날짜        종가     전일비        시가        고가       저가        거래량
0    2024.01.31  102900.0  4900.0   99400.0  103000.0  98500.0  3742060.0
1    2024.01.30   98000.0  1900.0  101000.0  103600.0  97800.0  3463949.0
2    2024.01.29   99900.0  5500.0   94700.0  100000.0  94400.0  3200604.0
3    2024.01.26   94400.0  1400.0   94600.0   95700.0  92700.0  2383070.0
4    2024.01.25   93000.0  5100.0   87100.0   93000.0  86100.0  3397759.0
..          ...       ...     ...       ...       ...      ...        ...
115  2023.08.10   78900.0     0.0   78800.0   79100.0  77800.0  1050522.0
116  2023.08.09   78900.0   600.0   77700.0   78900.0  77700.0   946574.0
117  2023.08.08   78300.0   700.0   77700.0   78600.0  77700.0  1527761.0
118  2023.08.07   77600.0  3000.0   79900.0   80000.0  77300.0  2254858.0
119  2023.08.04   80600.0   100.0   80900.0   81100.0  80100.0   616785.0

[120 rows x 7 columns]
