<a href="https://colab.research.google.com/github/hiroto-noguchi/weekly_articles_2023/blob/main/weekly_article_2023_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Proofreading via the LanguageTool HTTP API (URL endpoint)
import requests
import json

# LanguageTool HTTP endpoint used to check the sample sentence.
url = 'https://api.languagetool.org/v2/check'
# Sample sentence (contains a subject-verb agreement error).
text = 'This are the correct sentences.'
params = {
    'text': text,
    'language': 'en-US',
}
# Bound the wait so an unresponsive server cannot hang the notebook cell.
response = requests.post(url, data=params, timeout=30)
# Fail loudly on an HTTP error status instead of trying to parse an
# error body as if it were a successful JSON result.
response.raise_for_status()
result = response.json()
# Bare expression so the notebook displays the decoded response.
result

In [None]:
# Proofreading with the language_tool_python library
!pip install language-tool-python
from language_tool_python import LanguageTool

# Sample sentence (contains a subject-verb agreement error).
text = "This are the correct sentences."
# Create a LanguageTool checker for 'en-US' and run it on the sample.
tool = LanguageTool('en-US')
matches = tool.check(text)
# Bare expression so the notebook displays the check result.
matches

In [None]:
# Print every issue found by the checker, one after another.
for error in matches:
    print(error)

In [None]:
import requests

def translate(text: str, langpair: str = "en|ja", timeout: float = 10):
    """Translate *text* with the MyMemory web API.

    Args:
        text: The text to translate; sent as the ``q`` query parameter.
        langpair: Source and target language codes joined by ``|``, sent
            as the ``langpair`` query parameter. Defaults to ``"en|ja"``.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the caller forever.

    Returns:
        The decoded JSON response body. Callers in this notebook read the
        translated string from ``data['responseData']['translatedText']``.
    """
    endpoint = "https://api.mymemory.translated.net/get"
    params = {
        "q": text,
        "langpair": langpair,
    }
    response = requests.get(endpoint, params=params, timeout=timeout)
    return response.json()

# Translate a sample sentence and print only the translated text
# taken from the 'responseData' part of the response.
text = "This are the correct sentences."
translation = translate(text)
print(translation['responseData']['translatedText'])

In [None]:
# Translate the message of every detected issue and print the result.
for error in matches:
    translation = translate(error.message)
    print(translation['responseData']['translatedText'])

In [None]:
!git clone https://github.com/odashi/small_parallel_enja.git

def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path* (line endings kept)."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()


# Training split of the English/Japanese parallel corpus.
english_sentences_train = _read_lines('/content/small_parallel_enja/train.en.000')
japanese_sentences_train = _read_lines('/content/small_parallel_enja/train.ja.000')

# Test split of the same corpus.
english_sentences_test = _read_lines('/content/small_parallel_enja/test.en')
japanese_sentences_test = _read_lines('/content/small_parallel_enja/test.ja')

In [None]:
# Preview the first 10 Japanese test sentences.
japanese_sentences_test[:10]

In [None]:
# Preview the first 10 English test sentences.
english_sentences_test[:10]

In [None]:
import tensorflow as tf
import numpy as np

# Use the training split as the corpus for this cell.
japanese_sentences = japanese_sentences_train
english_sentences = english_sentences_train

# Fit one tokenizer per language to build its word -> integer vocabulary.
japanese_tokenizer = tf.keras.preprocessing.text.Tokenizer()
english_tokenizer = tf.keras.preprocessing.text.Tokenizer()
japanese_tokenizer.fit_on_texts(japanese_sentences)
english_tokenizer.fit_on_texts(english_sentences)

# Reverse lookup tables (integer -> word) for turning indices back into words.
japanese_index_word = {
    idx: token for token, idx in japanese_tokenizer.word_index.items()
}
english_index_word = {
    idx: token for token, idx in english_tokenizer.word_index.items()
}

# Convert every sentence into its sequence of integer word indices.
japanese_sequences = japanese_tokenizer.texts_to_sequences(japanese_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

# Pad with trailing zeros so all sentences of a language share one length.
japanese_padded = tf.keras.preprocessing.sequence.pad_sequences(
    japanese_sequences, padding='post')
english_padded = tf.keras.preprocessing.sequence.pad_sequences(
    english_sequences, padding='post')

# Vocabulary sizes are one larger than the word index so that index 0,
# which the padding step uses, also has a slot.
japanese_vocab_size = len(japanese_tokenizer.word_index) + 1
english_vocab_size = len(english_tokenizer.word_index) + 1
# Number of positions in every padded English sentence.
english_length = english_padded.shape[1]

# Define the model: embed the Japanese indices, encode the sentence with a
# bidirectional LSTM, repeat that encoding once per English position, decode
# with a second bidirectional LSTM and emit a softmax over the English
# vocabulary at every position.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(japanese_vocab_size, 256))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)))
model.add(tf.keras.layers.RepeatVector(english_length))
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True)))
model.add(tf.keras.layers.Dense(english_vocab_size, activation='softmax'))

# Compile the model; the targets are integer indices, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the model for 30 epochs on the padded sentence pairs.
model.fit(japanese_padded, english_padded, epochs=30)

In [None]:
# Japanese sentences to translate: the first 20 of the test split.
input_sentences = japanese_sentences_test[:20]

# Convert the Japanese sentences to integer sequences with the tokenizer
# that was fitted on the training data.
input_sequences = japanese_tokenizer.texts_to_sequences(input_sentences)

# Pad to the same length as the padded training inputs.
input_padded = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences, padding='post', maxlen=len(japanese_padded[0]))

# Run the model on the padded inputs.
output_sequences = model.predict(input_padded)

# Turn each prediction back into English words: take the highest-scoring
# index at every position and stop at the first 0, the padding value.
output_sentences = []
for seq in output_sequences:
    output_sentence = []
    for i in np.argmax(seq, axis=-1):
        if i == 0:
            break
        if i in english_index_word:
            output_sentence.append(english_index_word[i])
    output_sentences.append(' '.join(output_sentence))

# Show each source sentence next to its translation. The source sentences
# were read with readlines() and still end in a newline; strip it so the
# Japanese line is not followed by a spurious blank line.
for i, (sentence, output_sentence) in enumerate(
        zip(input_sentences, output_sentences), start=1):
    print("日本語{}：{}".format(i, sentence.rstrip('\n')))
    print("英語{}：{}".format(i, output_sentence))
    print()