<a href="https://colab.research.google.com/github/kunishou/Talking_Robot/blob/main/04_talking_robot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the NVIDIA GPU status of this runtime.
!nvidia-smi

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the fine-tuned BERT model is loaded
# later from a path under /content/drive/MyDrive.
drive.mount('/content/drive')

# VOICEVOX setup

In [None]:
# パッケージのクローンとセットアップ
!git clone https://github.com/VOICEVOX/voicevox_core -b 0.11.4
%cd voicevox_core
!python configure.py --use_gpu --voicevox_version 0.11.4

# パッケージのインストール
!pip install -r -q requirements.txt
!pip install -q .

# pyopenjtalkのインストール
!pip install -q pyopenjtalk

# Whisper setup

In [None]:
# Install Whisper directly from the openai/whisper GitHub repository.
!pip install -q git+https://github.com/openai/whisper.git

In [None]:
import whisper
# Load the "large" Whisper checkpoint; `model` is used later to transcribe
# the recorded audio.
model = whisper.load_model("large")

In [None]:
# Libraries used to convert English words to katakana
!pip install -q mecab-python3
!pip install -q unidic
!pip install alkana
# Run the unidic package's download command
!python -m unidic download

# BERT setup

In [None]:
# Install Hugging Face Transformers from source
# NOTE(review): the clone lands in the current working directory, and an
# earlier cell runs `%cd voicevox_core` — confirm that location is intended.
!git clone https://github.com/huggingface/transformers
%cd transformers
!pip install -q .

# Install the packages for Japanese support
!pip install -q fugashi[unidic-lite]
!pip install -q ipadic

# Install Hugging Face Datasets
!pip install -q datasets

In [None]:
from transformers import BertJapaneseTokenizer, AutoModelForQuestionAnswering
import torch

# Input text: the passage the question-answering model extracts answers from.
# The backslash continuations keep this a single string literal, so the
# indentation of each continued line is part of the text.
context = "私の名前はサンタクロースです。 \
          年齢は26歳です。 \
          北海道に住んでいます。 \
          昨日は東京に出かけました。 \
          好きなイベントはヤフーアドベントカレンダーです。 \
          好きな食べ物はケンタッキーフライドチキンです。 \
          趣味はみんなにプレゼントを配ることです。 \
          得意なプログラミングな言語はパイソンです。 "


# Sample questions about the context
question_list = ["昨日はどこへ出かけましたか？",
                 "あなたの名前は何ですか？",
                 "あなたの趣味は何ですか？",
                 "あなたが好きなイベントは何ですか？",
                 "あなたが得意プログラミング言語は何ですか？",
                 ]

question = question_list[4]

# Prepare the model and the tokenizer
bert_model = AutoModelForQuestionAnswering.from_pretrained('/content/drive/MyDrive/talking_robot/transformers/output/')  # path to the fine-tuned model
tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

# Run inference
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
with torch.no_grad():  # inference only: do not record operations for autograd
    output = bert_model(**inputs)
# Highest-scoring start / end token positions, as plain ints; the end index is
# made exclusive so it can be used directly in the slice below.
answer_start = int(torch.argmax(output.start_logits))
answer_end = int(torch.argmax(output.end_logits)) + 1
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

# Print the result
print("質問: "+question)
print(f"応答: {answer}です")

In [None]:
#@title Talking Robot
import glob
import librosa
import IPython
import MeCab
import unidic
import pandas as pd
import alkana
import re
import os

# Work from the VOICEVOX checkout; the synthesized WAV is later loaded from
# /content/voicevox_core.
%cd /content/voicevox_core

# Recording setup for the local PC microphone
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

# JavaScript evaluated in the notebook page. It defines `record(time)`, which
# captures microphone audio with MediaRecorder for `time` milliseconds and
# resolves to the recording encoded as a data URL (via FileReader.readAsDataURL).
# The stream's tracks are stopped once recording ends so the browser releases
# the microphone instead of keeping it captured after every run.
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    stream.getTracks().forEach(track => track.stop())
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec, filename='audio.wav'):
  """Record from the browser microphone and write the audio bytes to a file.

  Args:
    sec: Recording length in seconds; passed to the JS `record` helper in
      milliseconds.
    filename: Path of the file the decoded bytes are written to.

  NOTE(review): the bytes are whatever MediaRecorder produced in the browser,
  which is not necessarily WAV despite the default file name — confirm.
  """
  # Inject the JS helper into the page before calling it.
  display(Javascript(RECORD))
  duration_ms = sec * 1000
  data_url = output.eval_js('record(%d)' % duration_ms)
  # The data URL has the form "<header>,<base64 payload>"; keep the payload.
  encoded_audio = data_url.split(',')[1]
  audio_bytes = b64decode(encoded_audio)
  with open(filename, 'wb+') as audio_out:
    audio_out.write(audio_bytes)


# Record from the microphone into `audiofile`
audiofile = "audio.wav"
second = 6 #@param {type:"number"}
print(f"Speak to your microphone {second} sec...")
record(second, audiofile)
print("Done!")

# Transcribe the file that was just recorded. Using `audiofile` here (rather
# than a second hard-coded path) keeps recording and transcription in sync.
result = model.transcribe(audiofile, verbose=False, language="ja")
print(f'{result["text"]}（英カナ変換前）')

### Convert English words to katakana

# Matches strings made up solely of half-width (ASCII) letters
alphaReg = re.compile(r'^[a-zA-Z]+$')
def isalpha(s):
    """Return True if `s` matches the ASCII-letters-only pattern `alphaReg`."""
    return alphaReg.match(s) is not None

sample_txt = result["text"]

# Split the transcript into whitespace-separated tokens with MeCab
wakati = MeCab.Tagger('-Owakati')
wakati_result = wakati.parse(sample_txt)
#print(wakati_result)

# Build {english_word: katakana}. alkana.get_kana() returns None for words it
# has no reading for; those are skipped so str.replace() below never receives
# None (which would raise TypeError) and the word is left as-is.
dict_rep = {}
for word in wakati_result.split():
    if word in dict_rep or not isalpha(word):
        continue
    katakana = alkana.get_kana(word)
    if katakana is not None:
        dict_rep[word] = katakana

# Replace longer words first so a short word cannot rewrite part of a longer one.
for word in sorted(dict_rep, key=len, reverse=True):
    sample_txt = sample_txt.replace(word, dict_rep[word])

sample_txt = sample_txt.replace(" ", "")

# Remove half-width symbols (currently disabled)
#symbolReg = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
#sample_txt = symbolReg.sub(' ', sample_txt)

print(f'{sample_txt}（英カナ変換後）')

# Run inference: the kana-converted transcript is used as the question
inputs = tokenizer.encode_plus(sample_txt, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
with torch.no_grad():  # inference only: do not record operations for autograd
    output = bert_model(**inputs)
# Highest-scoring start / end token positions, as plain ints; the end index is
# made exclusive so it can be used directly in the slice below.
answer_start = int(torch.argmax(output.start_logits))
answer_end = int(torch.argmax(output.end_logits)) + 1
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
# Append the sentence ending that is spoken after the extracted span
answer = answer + "です。"

# NOTE: the name is spelled `speacker_id` because sound() reads this global.
speacker_id = 0

# make answer sound
# The command is passed as an argument list (no shell), so an answer that
# contains spaces or shell metacharacters reaches run.py as one --text value.
import subprocess

tts_command = [
    "python", "/content/voicevox_core/example/python/run.py",
    f"--text={answer}",  # `=` form so a leading "-" is not parsed as an option
    "--speaker_id", str(speacker_id),
    "--f0_speaker_id", "0",
    "--f0_correct", "0",
    "--root_dir_path=/content/voicevox_core/release",
    "--use_gpu",
]
tts_proc = subprocess.run(tts_command, capture_output=True, text=True)
print(tts_proc.stdout, end="")
if tts_proc.returncode != 0:
    # Fail here with the script's stderr rather than later when the WAV file
    # turns out to be missing.
    raise RuntimeError(f"VOICEVOX run.py exited with code {tts_proc.returncode}:\n{tts_proc.stderr}")

print(answer)

def sound():
  """Load the synthesized answer audio and return an autoplaying Audio widget.

  Reads the module-level `answer` and `speacker_id` to build the path
  /content/voicevox_core/{answer}-{speacker_id}.wav.

  Returns:
    IPython.display.Audio built from the loaded samples and sample rate.
  """
  # The original also called librosa.util.example_audio_file() and discarded
  # the result; that helper is not available in newer librosa releases, so the
  # unused call was dropped.
  y_full, sr_full = librosa.load(f"/content/voicevox_core/{answer}-{speacker_id}.wav")
  return IPython.display.Audio(data = y_full, rate=sr_full, autoplay = True)

# Last expression of the cell, so the returned widget is displayed.
sound()

In [None]:
# Stop the GPU by unassigning this Colab runtime
from google.colab import runtime
runtime.unassign()