# pdfの英語文献から，出現頻度順の日訳英単語リストを作成するプログラム
## INPUT : PDF 英語文献     
## OUTPUT : CSV 翻訳 word list

### 品詞の略については以下サイトを参照
[Qiita](https://qiita.com/m__k/items/ffd3b7774f2fde1083fa "NLTKの使い方をいろいろ調べてみた")

### 初期設定
#### 最初の一回のみ，実行する．その後はコメントアウトしてよい．

In [1]:
# # 使用するライブラリのインストール
# !pip install PyPDF2
# !pip install nltk
# !pip install deep_translator
# !pip install tqdm

# # nltkライブラリの事前ダウンロードファイル
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [2]:
import PyPDF2

# INPUT: pdf_file に 読み込むパスを記述！！！

In [3]:
# pdf 読み込み，textに変換．
pdf_file = "../../../RNA Viral Community in Human Feces.pdf"
with open(pdf_file, "rb") as f:
    reader = PyPDF2.PdfFileReader(f)
    num_page = reader.getNumPages()
    pages = []
    for i in range(num_page):
        pages.append(reader.getPage(i).extractText())


### 単語の整理: 単語群をもっと整えたい場合は，以下を編集するとよいかも
### ほしい品詞なども設定できる

In [4]:
import os
import nltk
import numpy as np
import pandas as pd
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

word_lists = []
for i in range(num_page):
    word_list = nltk.word_tokenize(pages[i])
    word_lists.extend(word_list)

# いらない単語を削除
removed_word_list = []
for word in word_lists:
    if word.isalpha(): # アルファベットのみ使用
        if len(word) > 2: # 2文字より大きい文字列のみ使用
            removed_word_list.append(word)
            
            
# basic word list を読み込み，listに含まれる単語を削除する．
basic_words_total_df = pd.read_csv(os.getcwd() + '/basicwordlist/english-word-list-total.csv', header=3, sep=';')
basic_words_nouns_df = pd.read_csv(os.getcwd() + '/basicwordlist/english-word-list-nouns.csv', header=3, sep=';').rename(columns={'noun': 'word'})
basic_words_verbs_df = pd.read_csv(os.getcwd() + '/basicwordlist/english-word-list-verbs.csv', header=3, sep=';').rename(columns={'verb': 'word'})
basic_words_adjectives_df = pd.read_csv(os.getcwd() + '/basicwordlist/english-word-list-adjectives.csv', header=3, sep=';').rename(columns={'adjective': 'word'})
most_common_words_df = pd.read_csv(os.getcwd() + '/basicwordlist/most-common-words-1k.csv', header=0).rename(columns={'Word': 'word'})

basic_words_df = pd.concat([basic_words_total_df, basic_words_nouns_df, basic_words_verbs_df, basic_words_adjectives_df, most_common_words_df], join='inner')

basic_words_df = basic_words_df.dropna(subset=['word']).reset_index()
basic_words = basic_words_df['word'].to_list()
#print(basic_words_df)

not_basic_word_list = []
for word in removed_word_list:
    if word.lower() not in basic_words:
        not_basic_word_list.append(word)


In [5]:
word_dict = {} # 重複回数をカウント
for word in not_basic_word_list:
    if not word in word_dict:
        word_dict[word] = 1
    else:
        word_dict[word] += 1
                

sorted_dict = sorted(word_dict.items(), key=lambda x:x[1], reverse=True)

# 品詞の取得
pos = nltk.pos_tag([pair[0] for pair in sorted_dict])

print(len(pos))
# 欲しい品詞のポジションを取得
p = []
for i, pair in enumerate(pos):
    po = pair[1]
    if any(x in po for x in ["FW", "JJ", "LS", "MD", "NN", "RB", "RP", "VB"]):
        p.append(i)

sorted_dict = np.array(sorted_dict)[p]
pos = np.array(pos)[p]

print(len(p))
print(sorted_dict.shape)
print(pos.shape)
print((pos))
print(sorted_dict)

1450
1444
(1444, 2)
(1444, 2)
[['PMMV' 'NNP']
 ['virus' 'NN']
 ['viruses' 'NNS']
 ...
 ['Comparison' 'NNP']
 ['naturally' 'RB']
 ['blender' 'VB']]
[['PMMV' '108']
 ['virus' '103']
 ['viruses' '84']
 ...
 ['Comparison' '1']
 ['naturally' '1']
 ['blender' '1']]


### 英日訳 1秒間隔で英日訳を取得
- 単語数が少ない場合は，sleepを消してもよい．
- 単語数が多く，sleepが短いと，接続がキャンセルされる．

In [6]:
from http.client import RemoteDisconnected
from deep_translator import GoogleTranslator
from tqdm.notebook import tqdm
from time import sleep

translator = GoogleTranslator(sourse='en', target='ja')

engs = []
for i, word in tqdm(enumerate(sorted_dict), total=len(sorted_dict)):
    try:
        translated = translator.translate(word[0])
        engs.append(translated)
        print("{}\t{}\t{}\t{}\t{}".format(sorted_dict[i][1], sorted_dict[i][0], pos[i][0], pos[i][1], translated))
        sleep(1)
    except:
        print(f"Error!!!\ti: {i}\tword: {word}")
        sleep(3)
        try:
            translated = translator.translate(word[0])
            engs.append(translated)
            print("{}\t{}\t{}\t{}\t{}".format(sorted_dict[i][1], sorted_dict[i][0], pos[i][0], pos[i][1], translated))
            sleep(1)
        except:
            print(f"Error!!!!!! 2times\ti: {i}\tword: {word}")
    

  0%|          | 0/1444 [00:00<?, ?it/s]

108	PMMV	PMMV	NNP	PMMV
103	virus	virus	NN	ウイルス
84	viruses	viruses	NNS	ウイルス
63	RNA	RNA	NNP	RNA
61	viral	viral	JJ	ウイルス性
50	fecal	fecal	JJ	糞便
49	sequences	sequences	NNS	シーケンス
36	plant	plant	NN	工場
35	samples	samples	NNS	サンプル
35	Lib	Lib	NNP	リブ
33	feces	feces	NNS	糞便
29	mosaic	mosaic	VBP	モザイク
27	lane	lane	JJ	LANE
24	libraries	libraries	NNS	ライブラリ
23	sequence	sequence	NN	順序
21	Viruses	Viruses	NNP	ウイルス
19	Plant	Plant	NNP	工場
18	PBV	PBV	NNP	PBV
18	genome	genome	VBD	ゲノム
16	mottle	mottle	JJ	まだら
16	sample	sample	JJ	サンプル
15	individuals	individuals	NNS	個人
14	Feces	Feces	NNS	糞便
14	Singapore	Singapore	NNP	シンガポール
13	sequencing	sequencing	VBG	シーケンシング
13	DNA	DNA	NNP	DNA
12	Biology	Biology	NNP	生物学
12	PLoS	PLoS	NNP	PLOS
12	library	library	JJ	図書館
12	segments	segments	NNS	セグメント
11	gastroenteritis	gastroenteritis	VBP	胃腸炎
11	collected	collected	VBN	集めました
11	plants	plants	NNS	植物
11	DOI	DOI	NNP	土肥
11	January	January	NNP	1月
11	Volume	Volume	NN	音量
11	clones	clones	NNS	クローン
11	PCR	PCR	NNP	PCR
10	San	San	NNP	さん
10	Dieg

### 必要な配列をマージしておく 

In [None]:
import pandas as pd

counts = [i[1] for i in sorted_dict]
words = [i[0] for i in sorted_dict]
poss = [i[1] for i in pos]

result_df = pd.DataFrame(list(zip(counts, words, poss, engs)), columns=["出現回数", "英語", "品詞", "日訳"]) 

print(result_df)

### 翻訳前と翻訳後が同じであるものは削除

In [None]:
print(result_df.shape)

result_df = result_df[~(result_df["英語"].lower()==result_df["日訳"].lower())]

print(result_df.shape)
    

## OUTPUT: save_fileに保存先パスを記述！！！

In [None]:
save_file = os.path.splitext(pdf_file)[0] + "_word_list" + ".csv"

result_df.to_csv(save_file, encoding='utf-8-sig', header=True, index=False)