# Youtube Thumbnail Crawling

### Setup

In [8]:
!pip3 install --upgrade google-api-python-client

Collecting google-api-python-client
  Using cached google_api_python_client-2.77.0-py2.py3-none-any.whl (11.0 MB)
Collecting uritemplate<5,>=3.0.1
  Using cached uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting google-auth<3.0.0dev,>=1.19.0
  Using cached google_auth-2.16.0-py2.py3-none-any.whl (177 kB)
Collecting google-auth-httplib2>=0.1.0
  Using cached google_auth_httplib2-0.1.0-py2.py3-none-any.whl (9.3 kB)
Collecting httplib2<1dev,>=0.15.0
  Using cached httplib2-0.21.0-py3-none-any.whl (96 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  Using cached google_api_core-2.11.0-py3-none-any.whl (120 kB)
Collecting protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5
  Downloading protobuf-4.21.12-cp39-cp39-win_amd64.whl (527 kB)
     -------------------------------------- 527.0/527.0 kB 4.7 MB/s eta 0:00:00
Collecting googleapis-common-protos<2.0dev,>=1.56.2
  Using cached googleapis_common_proto

In [16]:
from googleapiclient.discovery import build
from IPython.display import JSON
import pandas as pd

In [54]:
# insert personal API key here
api_key = ''

### Channel Statistics Function

In [14]:
# all the channels to be seen
channel_ids = ['UCUj6rrhMTR9pipbAWBAMvUQ',]

In [17]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

In [35]:
def get_channel_stats(youtube, channel_ids):
    
    all_data = []
    
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=','.join(channel_ids)
    )
    response = request.execute()

    # loop through items
    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
                'views': item['statistics']['viewCount'],
                'totalVideos': item['statistics']['videoCount'],
                'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
               }
        
        all_data.append(data)
        
    return(pd.DataFrame(all_data))

In [36]:
channel_stats = get_channel_stats(youtube, channel_ids)

In [37]:
channel_stats

Unnamed: 0,channelName,views,totalVideos,playlistId
0,침착맨,1512708980,6427,UUUj6rrhMTR9pipbAWBAMvUQ


### Video ID Function

In [43]:
# 다음은 video id를 출력해주는 함수이다
def get_video_ids(youtube, playlist_id):
    video_ids = []
    
    request = youtube.playlistItems().list(
        part="snippet,contentDetails",
        playlistId=playlist_id,
        maxResults = 10
    )
    response = request.execute()
    
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
    
    # 한번에 불러올 수 있는 최대 영상 수가 50개임으로 아래 코드를 통해 next page를 불러온다
    next_page_token = response.get('nextPageToken')
    while next_page_token is not None:
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=playlist_id,
            maxResults = 50,
            pageToken = next_page_token)
        response = request.execute()

        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])

        next_page_token = response.get('nextPageToken')
        
    return video_ids


In [44]:
# 39개의 영상이 들어있는 '2023년 침착맨 정주행' 플레이리스트
playlist_id = "PLif_jr7pPZACDdM6sB6Yr_0L0VGXEjF1b"

# video id를 불러오는 함수를 위 플레이리스트를 변수로 호출
video_ids = get_video_ids(youtube, playlist_id)

In [45]:
# 39개의 영상 id가 출력되는것을 확인
len(video_ids)

39

### Video Info Function

In [52]:
# this is the function that @haemilia created for crawling the desired youtube information

def get_video_info(youtube, video_ids):

    request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_ids
        )
    response = request.execute()

    video_info = []

    for video in response['items']:
        keep = {'Published': video['snippet']['publishedAt'],
        'ThumbnailUrl': video['snippet']['thumbnails']['default']['url'],
        'ViewCount': video['statistics']['viewCount'],
        'VideoId': video['id']}

        video_info.append(keep)

    return(pd.DataFrame(video_info))

In [47]:
video_df = get_video_info(youtube, video_ids)
video_df

Unnamed: 0,Published,ThumbnailUrl,ViewCount,VideoId
0,2023-01-01T10:00:23Z,https://i.ytimg.com/vi/ysetd_r8Z9M/default.jpg,648900,ysetd_r8Z9M
1,2023-01-02T10:00:09Z,https://i.ytimg.com/vi/NeTqujXzubM/default.jpg,650495,NeTqujXzubM
2,2023-01-03T10:00:21Z,https://i.ytimg.com/vi/RRnO-9xfJY8/default.jpg,1262715,RRnO-9xfJY8
3,2023-01-04T10:00:08Z,https://i.ytimg.com/vi/K6Pfi0yLav8/default.jpg,1261217,K6Pfi0yLav8
4,2023-01-05T10:00:10Z,https://i.ytimg.com/vi/SP-LJqVgQuw/default.jpg,3685979,SP-LJqVgQuw
5,2023-01-06T10:00:36Z,https://i.ytimg.com/vi/vO9sb-w1gdY/default.jpg,1094557,vO9sb-w1gdY
6,2023-01-07T10:00:34Z,https://i.ytimg.com/vi/z_OdvenYnAs/default.jpg,1184344,z_OdvenYnAs
7,2023-01-08T10:00:07Z,https://i.ytimg.com/vi/L2ZBNP_4m8E/default.jpg,677070,L2ZBNP_4m8E
8,2023-01-09T10:00:19Z,https://i.ytimg.com/vi/QAlkUzhL03U/default.jpg,618567,QAlkUzhL03U
9,2023-01-12T10:00:36Z,https://i.ytimg.com/vi/za_EbAThN1A/default.jpg,420462,za_EbAThN1A


In [48]:
# function to create CSV file from DF
video_df.to_csv('2023.csv')

## 2021 Calmdownman Playlist Crawling

In [49]:
# 347개의 영상이 들어있는 '2021년 침착맨 정주행' 플레이리스트
playlist_id = "PLif_jr7pPZADJ2MJ-iHlgLijM6xkCaFlo"

# video id를 불러오는 함수를 위 플레이리스트를 변수로 호출
video_ids = get_video_ids(youtube, playlist_id)

In [50]:
# 영상이 총 347개 저장 되었는지 확인
len(video_ids)

347

In [55]:
# '2021년' 플레이리스트에 대한 정보 갖고오기
df = get_video_info(youtube, video_ids)
df

HttpError: <HttpError 400 when requesting https://youtube.googleapis.com/youtube/v3/videos returned "The request specifies an invalid filter parameter.". Details: "[{'message': 'The request specifies an invalid filter parameter.', 'domain': 'youtube.parameter', 'reason': 'invalidFilters', 'location': 'parameters.', 'locationType': 'other'}]">