# 유튜브 랭킹 데이터 수집해서 시각화하기

In [1]:
# Install the scraping prerequisites from the notebook: refresh the apt
# package index, install Selenium via pip, then install the Chromium driver.
# Standard output of each command is redirected to /dev/null.
!apt-get update > /dev/null
!pip install selenium > /dev/null
!apt install chromium-chromedriver > /dev/null





In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd

In [3]:
# Build a headless Chrome session for scraping.
options = webdriver.ChromeOptions()
options.add_argument('--headless')   # run without a visible browser window
options.add_argument('--no-sandbox')
options.add_argument("--single-process")
options.add_argument("--disable-dev-shm-usage")
# Pass the options by keyword only. The positional driver-path argument was
# removed in Selenium 4, where Chrome('chromedriver', options=...) raises;
# on Selenium 3 the path already defaults to 'chromedriver', so this call
# behaves the same there.
driver = webdriver.Chrome(options=options)

In [4]:
# Load page 1 of the ranking board in the browser session.
url = 'https://youtube-rank.com/board/bbs/board.php?bo_table=youtube&page=1'
driver.get(url)

In [8]:
# '.aos-init' is a CSS class selector, not a tag name, so it must be looked
# up with the CSS strategy. The generic find_elements(by, value) form is used
# because the find_elements_by_* helpers were removed in Selenium 4, while
# this call exists in both Selenium 3 and 4.
trs = driver.find_elements('css selector', '.aos-init')
len(trs)

100

In [9]:
# Hand the rendered page over to BeautifulSoup for parsing.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [13]:
# Every element carrying the 'aos-init' class is treated as one ranking entry.
channel_list = soup.select('.aos-init')
len(channel_list)

100

In [16]:
# Inspect the first entry: read its category label. strip() with no argument
# trims only whitespace, so the surrounding brackets stay in the value.
channel = channel_list[0]
category = channel.select_one('p.category').get_text().strip()
category

'[음악/댄스/가수]'

In [17]:
# Channel name: text of the link inside the '.subject' element.
name = channel.select_one('.subject a').text.strip()
name

'BLACKPINK'

In [20]:
# Raw counter texts for the same entry. The final character of the video
# count is sliced off (presumably a unit suffix such as '개' - confirm
# against the page markup).
subscriber = channel.select_one('.subscriber_cnt').text
view = channel.select_one('.view_cnt').text
video = channel.select_one('.video_cnt').text[:-1]
subscriber, view, video

('6400만', '190억0381만', '371')

In [23]:
# Collect one [category, name, subscribers, views, videos] row per entry,
# keeping the counters as the raw strings found on the page.
channels = []

for channel in channel_list:
  # Trim spaces, newlines and the square brackets around the category label.
  category = channel.select_one('p.category').get_text().strip(' \n[]')
  name = channel.select_one('.subject a').text.strip()
  # Read the three counters in the same order as before; `end` drops the
  # final character of the video count only (None keeps the whole string).
  subscriber, view, video = (
      channel.select_one(css).text[:end]
      for css, end in (('.subscriber_cnt', None),
                       ('.view_cnt', None),
                       ('.video_cnt', -1))
  )
  channels.append([category, name, subscriber, view, video])

In [26]:
# Tabulate the raw rows; the column labels are category, channel,
# subscriber count, view count and video count (in Korean).
df = pd.DataFrame(channels, columns=['카테고리', '채널', '구독자수', '조회수', '동영상개수'])
df

Unnamed: 0,카테고리,채널,구독자수,조회수,동영상개수
0,음악/댄스/가수,BLACKPINK,6400만,190억0381만,371
1,음악/댄스/가수,HYBE LABELS,6030만,187억1305만,654
2,음악/댄스/가수,BANGTANTV,5640만,121억9496만,1579
3,음악/댄스/가수,SMTOWN,2850만,218억7092만,3729
4,키즈/어린이,Boram Tube Vlog [보람튜브 브이로그],2650만,110억5288만,223
...,...,...,...,...,...
95,음식/요리/레시피,까니짱 [ G-NI ],408만,10억3471만,482
96,키즈/어린이,로미유 스토리[Romiyu Story],408만,10억6557만,433
97,음악/댄스/가수,NCT DREAM,386만,3억7422만,228
98,BJ/인물/연예인,허팝Heopop,381만,31억3892만,1847


- 만과 억을 숫자로 바꾸는 함수

In [35]:
def convert_unit(s):
  """Convert a Korean-unit count string to a comma-grouped integer string.

  The input may combine an '억' (10**8) part, a '만' (10**4) part and a plain
  remainder, e.g. '190억0381만', '6400만' or '371'. A '개' suffix and thousands
  separators are ignored.

  Each unit is parsed by place value instead of by text substitution, so
  inputs without a '만' part (such as '3억') or with an unpadded '만' part
  (such as '1억5만') are converted correctly.

  Args:
    s: The count text to convert.

  Returns:
    The value formatted with thousands separators, e.g. '19,003,810,000'.

  Raises:
    ValueError: If the text is empty or a numeric part is not an integer.
  """
  text = s.strip().replace('개', '').replace(',', '')
  if not text:
    raise ValueError('empty count string')

  total = 0
  if '억' in text:
    eok, text = text.split('억', 1)
    total += int(eok) * 100_000_000
  if '만' in text:
    man, text = text.split('만', 1)
    total += int(man) * 10_000
  # Whatever follows the last unit marker is a plain number of ones.
  if text.strip():
    total += int(text)
  return f'{total:,d}'

In [36]:
# Sanity check of the converter on a value that mixes both units.
convert_unit('190억0381만')

'19,003,810,000'

- 두 번째 페이지 링크를 찾아 클릭

In [39]:
# Click the first link in the pager. The generic find_element(by, value) form
# is used because find_element_by_xpath was removed in Selenium 4, while this
# call exists in both Selenium 3 and 4.
driver.find_element('xpath', '//*[@id="list-skin"]/nav/span/a[1]').click()

In [40]:
# Re-read and re-parse the page source after the click.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [41]:
# Ranking entries found in the newly parsed page.
channel_list = soup.select('.aos-init')
len(channel_list)

100

In [42]:
# Collect one row per entry of the current page, passing the three counters
# through convert_unit.
results = []

for channel in channel_list:
  # Trim spaces, newlines and the square brackets around the category label.
  category = channel.select_one('p.category').get_text().strip(' \n[]')
  name = channel.select_one('.subject a').text.strip()
  # Counters are read and converted in the same order as before; `end` drops
  # the final character of the video count only (None keeps the whole string).
  subscriber, view, video = (
      convert_unit(channel.select_one(css).text[:end])
      for css, end in (('.subscriber_cnt', None),
                       ('.view_cnt', None),
                       ('.video_cnt', -1))
  )
  results.append([category, name, subscriber, view, video])

In [44]:
# Tabulate the converted rows and preview the first few.
df = pd.DataFrame(results, columns=['카테고리', '채널', '구독자수', '조회수', '동영상개수'])
df.head()

Unnamed: 0,카테고리,채널,구독자수,조회수,동영상개수
0,취미/라이프,JaeYeol ASMR 재열,3800000,1011540000,970
1,BJ/인물/연예인,워크맨-Workman,3780000,679370000,141
2,BJ/인물/연예인,waveya 2011,3730000,1346280000,702
3,키즈/어린이,Lime Tube[라임튜브],3730000,2595420000,1723
4,음악/댄스/가수,WINNER,3730000,1171260000,428


- 1~10 페이지 반복 수집

In [45]:
# Crawl ranking pages 1 through 10 and accumulate one row per channel.
results = []

for page in range(1, 11):
  url = f'https://youtube-rank.com/board/bbs/board.php?bo_table=youtube&page={page}'
  driver.get(url)
  time.sleep(3)  # pause before reading the page source
  html = driver.page_source
  soup = BeautifulSoup(html, 'html.parser')
  channel_list = soup.select('.aos-init')

  for channel in channel_list:
    # Trim spaces, newlines and the square brackets around the category label.
    category = channel.select_one('p.category').get_text().strip(' \n[]')
    name = channel.select_one('.subject a').text.strip()
    # Counters are read and converted in the same order as before; `end`
    # drops the final character of the video count only (None keeps all).
    subscriber, view, video = (
        convert_unit(channel.select_one(css).text[:end])
        for css, end in (('.subscriber_cnt', None),
                         ('.view_cnt', None),
                         ('.video_cnt', -1))
    )
    results.append([category, name, subscriber, view, video])

In [46]:
# Tabulate everything gathered by the multi-page crawl and preview it.
df = pd.DataFrame(results, columns=['카테고리', '채널', '구독자수', '조회수', '동영상개수'])
df.head()

Unnamed: 0,카테고리,채널,구독자수,조회수,동영상개수
0,음악/댄스/가수,BLACKPINK,64000000,19003810000,371
1,음악/댄스/가수,HYBE LABELS,60300000,18713050000,654
2,음악/댄스/가수,BANGTANTV,56400000,12194960000,1579
3,음악/댄스/가수,SMTOWN,28500000,21870920000,3729
4,키즈/어린이,Boram Tube Vlog [보람튜브 브이로그],26500000,11052880000,223


In [47]:
# Write the table to a CSV file without the index column.
df.to_csv('유튜브_순위.csv', index=False)

In [48]:
# End the whole WebDriver session. close() would only close the current
# window and leave the chromedriver process running; quit() shuts it down.
driver.quit()