<a href="https://www.kaggle.com/code/kaiyoo88/tutorial-youtube-comments-crawling?scriptVersionId=204255155" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [9]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [10]:
# YouTube API key
API_KEY = "Your-API-key"
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [11]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [12]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [13]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [19]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]

In [21]:
video_comments = {}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  0%|          | 0/20 [00:00<?, ?it/s]

0.008403778076171875s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:09<01:25,  4.73s/it]

9.454724311828613s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:20<02:04,  7.30s/it]

20.350581645965576s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:29<02:07,  7.95s/it]

29.39399027824402s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:37<02:01,  8.12s/it]

37.832478284835815s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [00:45<01:49,  7.84s/it]

45.10643267631531s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [00:53<01:42,  7.89s/it]

53.102251291275024s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [01:01<01:38,  8.19s/it]

61.95647573471069s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [01:09<01:28,  8.05s/it]

69.69085550308228s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [01:17<01:20,  8.03s/it]

77.67814421653748s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [01:29<01:22,  9.20s/it]

89.53856015205383s for query: 흑백요리사 안유성
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=kmbWUMhdBQU&maxResults=100&textFormat=plainText&key=AIzaSyCO13D4pLvlXCF19ixXrwJtRU8-lGcutOg&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 60%|██████    | 12/20 [01:36<01:09,  8.64s/it]

96.89432549476624s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [01:46<01:02,  8.89s/it]

106.35642266273499s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [01:53<00:50,  8.50s/it]

113.9426064491272s for query: 흑백요리사 조은주
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=2iVC6EgKOHY&maxResults=100&textFormat=plainText&key=AIzaSyCO13D4pLvlXCF19ixXrwJtRU8-lGcutOg&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 75%|███████▌  | 15/20 [02:00<00:39,  7.90s/it]

120.4763879776001s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [02:09<00:32,  8.16s/it]

129.2255094051361s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [02:16<00:23,  7.78s/it]

136.11410927772522s for query: 흑백요리사 이영숙
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=_yOU-oKKSXg&maxResults=100&textFormat=plainText&key=AIzaSyCO13D4pLvlXCF19ixXrwJtRU8-lGcutOg&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">


 90%|█████████ | 18/20 [02:23<00:15,  7.73s/it]

143.7360303401947s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [02:31<00:07,  7.75s/it]

151.51886200904846s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [02:38<00:00,  7.91s/it]

158.3001160621643s for query: 흑백요리사 박준우





## Merge youtube_comments with movie_rating_dataset

In [22]:
comments = pd.read_csv("youtube_comments.csv")

In [31]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7fe14e92cfa0>)

In [32]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [44]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,utIbM9Iua-k,너무 잼있었어요~~❤😊
1,utIbM9Iua-k,I LOVE HOW THEY DEBATED ABOUT THE MISSING RICE...
2,utIbM9Iua-k,저 라면은.. 너무 부담스럽다.\n가볍게 그런 게 라면인데..
3,utIbM9Iua-k,라면에 마늘 많이 넣으면 첫맛만 좋고 먹다 보면 이게 아닌데 하면서 질려서 못먹음
4,utIbM9Iua-k,둘다 시즌2도 잘 부탁드립니다 🤗✨️


In [43]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

movie data length: 200000
comments data length: 42312


In [39]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

Unnamed: 0,text
0,어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산..."
2,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
3,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...
4,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
...,...
242307,저 여자 직원 진짜 밉상이다...내가 식사하러 갔을때 부디 안 보였으면 좋겠어요.....
242308,저분들은 누구신데 저런평가를 내리시는거죠?? 사회에서 대단한 분들이신가요.??
242309,딤섬을 잘 만드시기때문에 딤섬전문점을 하시는게더 낫지 않을까 생각이드네요
242310,T 그 자체..


In [40]:
# NULL check
print(merged_df.isnull().values.any())

True


In [41]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

False


In [42]:
print(len(merged_df)) 

242303


In [46]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)


In [27]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [28]:
from konlpy.tag import Okt
okt = Okt()

In [52]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []
for sentence in tqdm.tqdm(merged_df['text']):
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence if not word in stopwords] # 불용어 제거
    tokenized_data.append(stopwords_removed_sentence)

100%|██████████| 242303/242303 [14:49<00:00, 272.45it/s]


In [53]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [54]:
model.wv.vectors.shape

(19057, 100)

In [55]:
print(model.wv.most_similar("백종원"))

[('최현석', 0.7200689911842346), ('쉐프', 0.6961478590965271), ('셰프', 0.680264413356781), ('재는', 0.6505212783813477), ('안성', 0.6470773816108704), ('정지선', 0.6408196091651917), ('이영숙', 0.630627453327179), ('사장', 0.6275045275688171), ('쉪', 0.5941402316093445), ('선경', 0.5800169706344604)]


In [56]:
print(model.wv.most_similar("최현석"))

[('정지선', 0.7785184383392334), ('셰프', 0.7356017231941223), ('백종원', 0.7200691103935242), ('쉐프', 0.7198329567909241), ('이분', 0.6777515411376953), ('솊', 0.6771531105041504), ('쉪', 0.6640861630439758), ('조은주', 0.6619386672973633), ('안성', 0.651208221912384), ('이영숙', 0.6495932340621948)]


## Save W2V model

In [57]:
model.wv.save_word2vec_format('ko_w2v')

In [58]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv