<a href="https://www.kaggle.com/code/kaiyoo88/tutorial-youtube-comments-crawling?scriptVersionId=237163256" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError

import pandas as pd
import time
import tqdm

In [2]:
# YouTube API key
API_KEY = "YOUR-API-KEY" # YOUR-API-KEY
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

In [3]:
# Get video ids for query. Youtube API allows only up to 50 videos

def get_video_ids(query, max_results=100):
    video_ids = []
    results_per_page = 50  # YouTube API maxResults 
    pages = (max_results + results_per_page - 1) // results_per_page  # calculate #pages
    next_page_token = None
    
    for _ in range(pages): # call api as many times as #pages
        try:
            request = youtube.search().list(
                q=query,
                part="snippet",
                maxResults=results_per_page,
                type="video",
                pageToken=next_page_token
            )
            response = request.execute()      

            # Only check if 'id' key exists and 'videoId' is accessible
            for item in response['items']:
                if isinstance(item, dict) and 'id' in item and 'videoId' in item['id']:
                    video_ids.append(item['id']['videoId'])

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except HttpError as e:
            error_reason = e.resp.get('reason')
            if error_reason == 'quotaExceeded':
                print("Quota exceeded. Saving collected data...")
                save_data_to_csv(video_comments)
                exit()
            else:
                print(f"An error occurred: {e}")
                
    return video_ids[:max_results]

In [4]:
# Get comments for 1 video. Youtube API allows only up to 100 comments per video
def get_top_korean_comments(video_id, max_results=100):
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_results,
            textFormat="plainText"
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)  
                
    except HttpError as e:
        error_reason = e.resp.get('reason')
        if error_reason == 'commentsDisabled':
            print(f"Comments are disabled for video {video_id}. Skipping.")
        elif error_reason == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()
        else:
            print(f"An error occurred: {e}")
    
    return comments

In [5]:
# Make data to dataframe
# video_comments looks like: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}
def save_data_to_csv(video_comments):    
    
    data = {"Video_ID": [], "Comment": []}
    
    for video_id, comments in video_comments.items():
        for comment in comments:
            data["Video_ID"].append(video_id)
            data["Comment"].append(comment)

    df = pd.DataFrame(data)
    
    # Export to CSV 
    df.to_csv("youtube_comments.csv", index=False)

In [7]:
participants = ["흑백요리사","백종원","안성재","에드워드 리","나폴리 맛피아","트리플스타","요리하는 돌아이","최현석","장호준","여경래","안유성","정지선","최강록","조은주","오세득","파브리치오 페라리","이영숙","선경 롱게스트","김도윤","박준우"]

In [8]:
video_comments = {}
# Ex: {"4DUYBXdUYzA": ["와 재밌다", "재미없다", ]}

start = time.time()
query_baisic = "흑백요리사"

for participant in tqdm.tqdm(participants):
    query = query_baisic + " " + participant 

    try:
        video_ids = get_video_ids(query, max_results=50) 

        for video_id in video_ids:
            comments = get_top_korean_comments(video_id)
            video_comments[video_id] = comments
    except HttpError as e:
        if e.resp.get('reason') == 'quotaExceeded':
            print("Quota exceeded. Saving collected data...")
            save_data_to_csv(video_comments)
            exit()

    end = time.time()    
    print(f"{end - start}s for query: {query}")    

save_data_to_csv(video_comments)

  5%|▌         | 1/20 [00:05<01:42,  5.38s/it]

5.391307592391968s for query: 흑백요리사 흑백요리사


 10%|█         | 2/20 [00:09<01:23,  4.62s/it]

9.482851028442383s for query: 흑백요리사 백종원


 15%|█▌        | 3/20 [00:13<01:12,  4.28s/it]

13.347225427627563s for query: 흑백요리사 안성재


 20%|██        | 4/20 [00:17<01:04,  4.04s/it]

17.01892900466919s for query: 흑백요리사 에드워드 리


 25%|██▌       | 5/20 [00:21<01:00,  4.06s/it]

21.124089241027832s for query: 흑백요리사 나폴리 맛피아


 30%|███       | 6/20 [00:24<00:54,  3.91s/it]

24.747091054916382s for query: 흑백요리사 트리플스타


 35%|███▌      | 7/20 [00:28<00:48,  3.73s/it]

28.09051251411438s for query: 흑백요리사 요리하는 돌아이


 40%|████      | 8/20 [00:31<00:44,  3.71s/it]

31.75338339805603s for query: 흑백요리사 최현석


 45%|████▌     | 9/20 [00:35<00:39,  3.59s/it]

35.079195499420166s for query: 흑백요리사 장호준


 50%|█████     | 10/20 [00:38<00:34,  3.48s/it]

38.33418536186218s for query: 흑백요리사 여경래


 55%|█████▌    | 11/20 [00:41<00:31,  3.53s/it]

41.95890736579895s for query: 흑백요리사 안유성


 60%|██████    | 12/20 [00:46<00:30,  3.78s/it]

46.31067395210266s for query: 흑백요리사 정지선


 65%|██████▌   | 13/20 [00:50<00:26,  3.80s/it]

50.16686463356018s for query: 흑백요리사 최강록


 70%|███████   | 14/20 [00:53<00:22,  3.69s/it]

53.61259913444519s for query: 흑백요리사 조은주


 75%|███████▌  | 15/20 [00:56<00:17,  3.54s/it]

56.788427114486694s for query: 흑백요리사 오세득


 80%|████████  | 16/20 [00:59<00:12,  3.25s/it]

59.36181950569153s for query: 흑백요리사 파브리치오 페라리


 85%|████████▌ | 17/20 [01:02<00:09,  3.28s/it]

62.72652268409729s for query: 흑백요리사 이영숙


 90%|█████████ | 18/20 [01:06<00:06,  3.41s/it]

66.44850420951843s for query: 흑백요리사 선경 롱게스트


 95%|█████████▌| 19/20 [01:09<00:03,  3.31s/it]

69.50300288200378s for query: 흑백요리사 김도윤


100%|██████████| 20/20 [01:12<00:00,  3.63s/it]

72.69213128089905s for query: 흑백요리사 박준우





## Merge youtube_comments with movie_rating_dataset

In [9]:
comments = pd.read_csv("youtube_comments.csv")

In [10]:
comments.head()

Unnamed: 0,Video_ID,Comment
0,vebF7wUQLMo,"《흑백요리사: 요리 계급 전쟁》, 9월 17일 넷플릭스에서 시청하세요: https:..."
1,vebF7wUQLMo,빽햄요리사ㄷㄷ
2,vebF7wUQLMo,0:07
3,vebF7wUQLMo,백수저중에 옴진리교 교주가 있노 ㄷㄷㄷㄷ
4,vebF7wUQLMo,심사위원 등장씬은 대한민국 역대 등장씬 고트중에 하나다 ㄹㅇ


## SKIP

In [None]:
import urllib.request
# download naver movie ratings dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

In [None]:
movie_data = pd.read_table('ratings.txt')
movie_data.head()

In [None]:
comments.head()

In [None]:
print(f"movie data length: {len(movie_data)}")
print(f"comments data length: {len(comments)}")

In [None]:
# Merge two dataset because number of Comments dataset is not big enough to train word vectors.
df1_text = movie_data[['document']].rename(columns={'document': 'text'})
df2_text = comments[['Comment']].rename(columns={'Comment': 'text'})

# merge movie_data and yt_comments_data
merged_df = pd.concat([df1_text, df2_text], ignore_index=True)
merged_df

In [None]:
# NULL check
print(merged_df.isnull().values.any())

In [None]:
merged_df = merged_df.dropna(how = 'any') # drop rows with null values
print(merged_df.isnull().values.any()) 

In [None]:
print(len(merged_df)) 

In [None]:
# remove all characters other than Hangeul
merged_df['text'] = merged_df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

In [None]:
# SKIP END

In [11]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading jpype1-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.1/494.1 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.2 konlpy-0.6.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
from konlpy.tag import Okt
okt = Okt()

In [13]:
# NULL check
print(comments.isnull().values.any()) # => True

comments = comments.dropna(how = 'any') # drop rows with null values

print(comments.isnull().values.any()) # => False

True
False


In [14]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

tokenized_data = []

# merged_df['text'] => comments['Comment']
for sentence in tqdm.tqdm(comments['Comment']): 
    sentence = str(sentence).strip()
    
    if not sentence:  # 빈 문자열이면 건너뛰기
        continue
        
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    stopwords_removed_sentence = [word for word in tokenized_sentence 
                                  if not word in stopwords # 조건1
                                     and len(word) >= 2 # 조건2   
                                     and word.isalpha()]  # 한글이나 영어 
    
    if stopwords_removed_sentence:  # 빈 리스트가 아니라면 추가
        tokenized_data.append(stopwords_removed_sentence)


100%|██████████| 41742/41742 [01:46<00:00, 392.66it/s]


In [17]:
pip install gensim

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
libpysal 4.9.2 requires packaging>=2

In [18]:
from gensim.models import Word2Vec

model = Word2Vec(sentences = tokenized_data, vector_size = 100, window = 5, min_count = 5, workers = 4, sg = 0)

In [19]:
model.wv.vectors.shape

(6492, 100)

In [58]:
print(model.wv.most_similar("백종원"))

[('성재', 0.9124249219894409), ('안성', 0.8956093192100525), ('기준', 0.8790994882583618), ('의원', 0.878623902797699), ('참가자', 0.8772395253181458), ('한테', 0.873734176158905), ('램지', 0.8582898378372192), ('고든', 0.8582121729850769), ('형평성', 0.8576611280441284), ('재는', 0.8506085872650146)]


In [60]:
print(model.wv.most_similar("최현석"))

[('정지선', 0.9415532946586609), ('이랑', 0.9064624309539795), ('성재', 0.9025325179100037), ('여경', 0.9014614224433899), ('셰프', 0.8953206539154053), ('이영숙', 0.8883089423179626), ('안유', 0.8773502111434937), ('헤드', 0.8700518012046814), ('호준', 0.8651288747787476), ('제자', 0.8638734221458435)]


## Save W2V model

In [20]:
model.wv.save_word2vec_format('ko_w2v')

In [21]:
!python -m gensim.scripts.word2vec2tensor --input ko_w2v --output ko_w2v

## Visualization for embedding

In [None]:
## Go to https://projector.tensorflow.org/
## and load ko_w2v_tensor.tsv and ko_w2v_metadata.tsv