# BeautifulSoup, Mecab, WordCloudを使って千鳥の漫才を可視化
BeautifulSoup  
http://kondou.com/BS4/  
Mecab  
https://taku910.github.io/mecab/  
WordCloud  
https://github.com/amueller/word_cloud  
mecab-python3  
https://github.com/SamuraiT/mecab-python3#installation  


<a href="https://colab.research.google.com/github/kaz12tech/ai_demos/blob/master/mecab_wordcloud.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install MeCab: the Python binding, a dictionary package, the system
# libraries, and the mecab-ipadic-NEologd dictionary built from source.
!pip install mecab-python3
!pip install unidic-lite
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!sudo apt install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file
%cd /content/mecab-ipadic-neologd
# The installer asks for confirmation; type "yes" when prompted.
!./bin/install-mecab-ipadic-neologd -n # yesを入力

In [None]:
import MeCab
from wordcloud import WordCloud
from collections import Counter
from bs4 import BeautifulSoup
import requests
import json
import re
from wordcloud import WordCloud

In [None]:
# Install a Japanese font package (IPA Gothic)
!apt -y install fonts-ipafont-gothic

In [None]:
def get_noun(text, *, dic_dir='/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd'):
  """Count the general nouns found in *text* by MeCab.

  The text is tokenized with MeCab and only tokens whose part of speech is
  '名詞' (noun) with sub-category '一般' (general) are kept. A token whose
  surface form is just the long-vowel mark "ー" is discarded.

  Args:
    text: The string to analyze.
    dic_dir: Directory of the MeCab dictionary passed to the tagger via
      ``-d``. Defaults to the mecab-ipadic-neologd path used by this file.

  Returns:
    A list of ``(word, count)`` tuples sorted by descending count, as
    produced by ``Counter.most_common()``.
  """
  tagger = MeCab.Tagger('-Ochasen -d ' + dic_dir)
  node = tagger.parseToNode(text)
  words = []

  while node is not None:
    # Split the comma-separated feature string once per node:
    # field 0 is the part of speech, field 1 its first sub-category.
    fields = node.feature.split(',')
    is_general_noun = (
        len(fields) >= 2  # skip malformed/short feature strings instead of raising
        and fields[0] == '名詞'
        and fields[1] == '一般'
    )
    if is_general_noun and node.surface != "ー":
      words.append(node.surface)
    node = node.next

  # Tally occurrences and return them sorted by frequency.
  return Counter(words).most_common()

In [None]:
def show_wordcloud(result, *, font_path='/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'):
  """Draw a word cloud from word frequencies and display it.

  Args:
    result: Word/count pairs in any form accepted by ``dict()``, for
      example the list of ``(word, count)`` tuples returned by
      ``get_noun``.
    font_path: Path to the font file handed to ``WordCloud``. Defaults to
      the Japanese Gothic font path used by this file.

  Returns:
    None. The figure is shown with ``matplotlib.pyplot.show()``.
  """
  # Imported locally, as in the original cell, so the plotting backend is
  # only loaded when a cloud is actually drawn.
  import matplotlib.pyplot as plt

  # WordCloud.fit_words expects a mapping of word -> frequency.
  frequencies = dict(result)

  wordcloud = WordCloud(
      background_color='black',
      font_path=font_path,
      width=900,
      height=600,
      colormap='tab10'
      ).fit_words(frequencies)

  # Show the generated image without axes.
  plt.figure(figsize=(15,10))
  plt.imshow(wordcloud)
  plt.axis("off")
  plt.show()

In [None]:
# Quick check: run MeCab morphological analysis on a sample sentence
# and print the raw analysis output.
mecab = MeCab.Tagger('-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd')
data = mecab.parse('これからWebスクレイピングで漫才のテキストデータを取得しWordCloudでデータビジュアライゼーションします。')
print(data)

In [None]:
# Fetch the page that embeds the manzai script.
url = 'https://www.smule.com/song/%E5%8D%83%E9%B3%A5-%E6%BC%AB%E6%89%8D-%E3%82%AF%E3%82%BB%E3%81%AE%E3%81%99%E3%81%94%E3%81%84%E5%AF%BF%E5%8F%B8%E5%B1%8B-%E5%8D%83%E9%B3%A5-karaoke-lyrics/6268102_6268102/arrangement'
# requests has no default timeout; bound the wait so the cell cannot hang
# forever on an unresponsive server.
res = requests.get(url, timeout=30)
print(res) # Note: calling this too often makes the site answer with 418

In [None]:
# Use BeautifulSoup to pull the lyrics out of the fetched page.
# The page carries its data in a <script> containing 'window.DataStore';
# inside it, the line with "Song:" holds a JSON object whose 'lyrics'
# entry is the text we want.
soup = BeautifulSoup(res.content, "lxml", from_encoding='utf-8')
# Reset so a failed extraction cannot silently reuse a value from an
# earlier run of this cell.
dialogue = None
for script_tag in soup.find_all('script'):
  if 'window.DataStore' not in str(script_tag):
    continue
  text = script_tag.get_text()
  # Split into lines
  lines = text.splitlines()
  for line in lines:
    if "Song:" not in line:
      continue
    # Keep only what follows the first "Song:" key, so occurrences of
    # "Song:" inside the JSON payload itself are left untouched.
    target_text = line.split("Song:", 1)[1].strip()
    # Drop a single trailing "," (the separator to the next key), but
    # never a comma that belongs to the JSON body.
    if target_text.endswith(","):
      target_text = target_text[:-1]
    dialogue = json.loads(target_text)['lyrics']
    print(dialogue)

if dialogue is None:
  # Fail here with a clear message rather than with a NameError/TypeError
  # in a later cell (e.g. when the request was rejected).
  raise RuntimeError("Could not find the 'Song:' entry in the page; check the HTTP response.")

In [None]:
# Strip markup tags, then unify the spelling "ドゥーゾー" to "ドゥーゾ".
dialogue = re.sub('<.+?>', '', dialogue).replace("ドゥーゾー", "ドゥーゾ")
print(dialogue)

In [None]:
%cd /content/
# 青空文庫から羅生門をダウンロード
!curl -O "https://www.aozora.gr.jp/cards/000879/files/127_ruby_150.zip"
# zipファイルを解凍
!unzip 127_ruby_150.zip
# 文章部分のみを別ファイルに保存
!sed -n 18,54p rashomon.txt > rashomon_content.txt
#テキストファイル読み込み
file = open('/content/rashomon_content.txt',encoding = 'shift_jis')
book_text = file.read()

In [None]:
# Morphological analysis with MeCab: count the nouns in each text
# Manzai script
manzai_result = get_noun(dialogue)
# Rashomon
book_result = get_noun(book_text)

In [None]:
# Visualize both frequency lists as word clouds: manzai first, then Rashomon
for word_counts in (manzai_result, book_result):
  show_wordcloud(word_counts)