### 유튜브 포스코인재창조원 채널에 달린 댓글을 수집한 후 엑셀로 저장

In [None]:
import json
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup

# --- Crawl settings --------------------------------------------------------
api_key = ""  # Google Cloud API key; must be filled in before running
api_host = "https://www.googleapis.com/youtube/v3/"
channel_id = "UCiQYRC773twEy79oUQ-Sg5g"  # ID of the YouTube channel to crawl
now_date = datetime.datetime.now().strftime("%Y-%m-%d")
# The Excel output is named after today's date.
save_file_name = "../outputs/{}_유튜브 댓글 수집 결과(인재창조원 채널).xlsx".format(now_date)
ssl_verify = False  # handed to requests as `verify=`; False skips TLS certificate checks
video_cnt = 0  # running count of videos visited
comment_list = []  # one dict per collected top-level comment

# Certificate verification is off, so suppress urllib3's InsecureRequestWarning
# to keep the progress output readable.
if not ssl_verify:
    _urllib3 = requests.packages.urllib3
    _urllib3.disable_warnings(_urllib3.exceptions.InsecureRequestWarning)

def _fetch_all_items(endpoint, params):
    """Return the ``items`` of every result page of a YouTube Data API list call.

    Args:
        endpoint: Path below ``api_host`` (e.g. ``"playlistItems/"``).
        params: Query parameters for the call; the API key is added here.

    Returns:
        A list with the ``items`` entries of all pages, in response order.
        When a response carries no ``items`` key (for example an error
        payload), a warning is printed and whatever was collected so far is
        returned instead of raising ``KeyError``.
    """
    query = dict(params, key=api_key)
    collected = []
    while True:
        # `params=` lets requests URL-encode the values; the timeout keeps a
        # stalled connection from hanging the whole crawl.
        response = requests.get(api_host + endpoint, params=query, verify=ssl_verify, timeout=30)
        parsed = response.json()
        if "items" not in parsed:
            reason = parsed.get("error", {}).get("message", response.status_code)
            print("   ※ API 요청 실패 [{}] : {}".format(endpoint, reason))
            break
        collected.extend(parsed["items"])
        # Follow nextPageToken until the API stops returning one.
        next_page_token = parsed.get("nextPageToken")
        if not next_page_token:
            break
        query["pageToken"] = next_page_token
    return collected


# Walk channel sections -> playlists -> videos -> top-level comment threads.
channel_sections = _fetch_all_items(
    "channelSections/", {"part": "contentDetails", "channelId": channel_id}
)
for section in channel_sections:
    # A section may list zero, one or several playlists; visit each of them.
    for this_playlist_id in section.get("contentDetails", {}).get("playlists", []):
        print("플레이리스트 아이디 : {}".format(this_playlist_id))
        # playlistItems.list documents maxResults as 0-50.
        videos = _fetch_all_items(
            "playlistItems/",
            {"part": "snippet", "maxResults": 50, "playlistId": this_playlist_id},
        )
        for video in videos:
            video_cnt += 1
            video_id = video["snippet"]["resourceId"]["videoId"]
            video_title = video["snippet"]["title"]
            print(" └비디오 아이디 : {} ({})".format(video_id, video_title))
            comment_threads = _fetch_all_items(
                "commentThreads/",
                {"part": "snippet", "maxResults": 100, "videoId": video_id},
            )
            for thread in comment_threads:
                top_comment = thread["snippet"]["topLevelComment"]["snippet"]
                comment_list.append({
                    "playlist id": this_playlist_id,
                    "영상제목": video_title,
                    "video_id": video_id,
                    "댓글": top_comment["textOriginal"],
                    "작성자": top_comment["authorDisplayName"],
                })

# Export everything that was collected to the Excel file and report totals.
print("총 영상수 : {}".format(video_cnt))
df = pd.DataFrame(comment_list)
df.to_excel(save_file_name)
print("총 댓글수 : {}".format(len(comment_list)))
print("댓글 수집을 완료하였습니다. 저장된 파일명 [{}]".format(save_file_name))

### 댓글에서 단어별 빈도 계산하여 워드클라우드 만들기

In [None]:
import re
import chardet 
from konlpy.tag import Kkma
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

kkma = Kkma()

# Merge every collected comment into one string; each comment is preceded by
# a single space so neighbouring comments do not run together.
text_data = "".join(" " + comment["댓글"] for comment in comment_list)

# Remove special characters. A raw string keeps the regex free of invalid
# Python escape sequences; only the square brackets need escaping inside the
# character class, and the leading "-" is literal.
special_char_pattern = re.compile(r"""[-=+,#/?:^$.@*"※~&%ㆍ!』‘|()\[\]<>`'…》]""")
text_data = special_char_pattern.sub("", text_data)

# Remove every character outside the Basic Multilingual Plane. This single
# range already contains the emoticon, pictograph, transport/map and flag
# blocks (U+1F1E0-U+1F6FF), so no separate ranges are needed.
emoji_pattern = re.compile("[\U00010000-\U0010FFFF]+")
text_data = emoji_pattern.sub("", text_data)

print("형태소 분석 중...")
# kkma.pos yields (word, tag) pairs for the cleaned text.
data_pos = kkma.pos(text_data)

print("명사만 필터링하는 중...")
# Fragments that are mapped to the full word they are counted as.
replace_words = {
    "포스": "포스코",
    "신입": "신입사원",
    "사원": "신입사원",
    "튜브": "유튜브",
}
# Keep only tokens tagged "NNG", normalising them through replace_words.
# To include non-noun words as well, accept further tags here (e.g. "VA").
data_arr = [replace_words.get(word, word) for word, pos in data_pos if pos == "NNG"]

print("단어별 발생빈도를 정렬하는 중...")
# (word, count) pairs ordered from most to least frequent.
counter = Counter(data_arr).most_common()

# The filter keeps words longer than one character that occur more than
# twice; the progress message states exactly that.
print("두 글자 이상 단어, 빈도수 3 이상인 것만 필터링하는 중...")
keywords_and_frequency = {
    word: freq for word, freq in counter if len(word) > 1 and freq > 2
}

print("형태소 및 빈도 분석 완료!")

# Font file used to render the words; the relative path is resolved against
# the current working directory.
font_path = "NanumBarunGothicBold.ttf"

if not keywords_and_frequency:
    # generate_from_frequencies raises ValueError when given no words, so
    # report the situation instead of failing with a traceback.
    print("워드클라우드를 만들 단어가 없습니다. 댓글 수집 결과와 필터링 조건을 확인하세요.")
else:
    # Build an 800x800 word cloud on a white background from the frequencies.
    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=800,
        background_color="white",
    )
    wordcloud = wordcloud.generate_from_frequencies(keywords_and_frequency)
    array = wordcloud.to_array()

    # Draw the image without axes.
    fig = plt.figure(figsize=(10, 10))
    plt.axis("off")
    plt.imshow(array, interpolation="bilinear")

    # Save the figure to disk before showing it.
    save_file_name = "../outputs/wordcloud.png"
    fig.savefig(save_file_name)
    print("워드클라우드 생성을 완료하였습니다. 저장된 파일명 [{}]".format(save_file_name))
    plt.show()