<a href="https://colab.research.google.com/github/kooose38/systematic-review-work-sigmate/blob/dev/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 前処理をそれぞれのデータ型にそろえて用意する
1. カウンターベクトル化して次元削減する -> sckit-learnの分類モデル
1. トークンベクトル -> 自然言語処理モデル 
  
negativeデータが多いのでメモリの関係上削減する。  


### load dataset 

In [None]:
!git clone https://github.com/kooose38/pytools_table

fatal: destination path 'pytools_table' already exists and is not an empty directory.


In [None]:
%cd pytools_table/

/content/pytools_table


In [None]:
import pandas as pd 
df_ = pd.read_csv("train_prep.csv")
df_.sort_values(by="judgement", ascending=False)

Unnamed: 0,abstract,judgement,abstract_isna
0,background: evaluate efficacy safety galantami...,0,0
1,wuhan covid-19 intubation experience,0,0
2,patients coronavirus disease 2019 (covid-19) h...,0,0
3,protein aggregation underlies wide range human...,0,0
4,objective: investigate application severity cl...,0,0
...,...,...,...
28160,objectives: sars-cov-2 infection diagnosis cha...,1,0
28161,"background march 2020, saudi ministry health i...",0,0
28162,study models local cross-city transmissions no...,0,0
28163,considerations resuscitation transfer paediatr...,0,0


### count-vector preprocess 
1. テキストがnanかどうか
1. 学習データの削減(メモリ不足のため)
1. クレンジング処理 
1. 形態素解析の品詞指定
1. tf-idfによるvector変換
1. pcaによる500次元削減
1. npyで保存

In [None]:
from typing import List, Dict, Any, Union
def get_token(df) -> List[str]:
    dataset = []
    for doc in df.abstract.to_list():
        data = []
        for text in doc.strip().split(" "):
            text = text.replace("(", "")
            text = text.replace(")", "")
            text = text.replace(":", "")
            text = text.replace("<", "")
            text = text.replace(">", "")
            text = text.replace(",", "")
            text = text.replace(".", "")
            if text == "": continue
            data.append(text)
        dataset.append(" ".join(data))
    return dataset 

prep_data = get_token(df)

df_["abstract_prep"] = prep_data


In [None]:
!pip install -q nltk

In [None]:
import nltk
nltk.download("all")

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk import stem 
lemmatizer = WordNetLemmatizer()
stemmer = stem.PorterStemmer()
doc_list = []

for doc in df_.abstract_prep.to_list():
    morph = nltk.word_tokenize(doc)
    poses = nltk.pos_tag(morph)
    word_list = []
    for word, pos in poses:
        if pos in ["DT", "NN", "JJ", "FW", "NNS", "NNP", "NNPS", "PDT", "PRP", "VBG", "VBD", "VBN", "VBP", "VBZ"]:
            word = word.lower()
            word = stemmer.stem(word)
            word_list.append(word)
    doc_list.append(word_list)
    


In [None]:
df = pd.DataFrame([" ".join(d) for d in doc_list])
df.columns = ["abstract"]
df["judgement"] = df_.judgement
df["is_nan"] = df_.abstract_isna 
df = df.sort_values(by="judgement", ascending=False).iloc[:5600, :]
df.to_csv("prep_text.csv", index=False)
df.head()

Unnamed: 0,abstract,judgement,is_nan
0,background present covid-19 overlap common inf...,1,0
1,valid specimen-pool stategi real-tim revers tr...,1,0
2,nucleic acid amplif detect sars-cov-2 rna resp...,1,0
3,bodi temperatur screen sars-cov-2 infect young...,1,0
4,diagnost accuraci cerebrospin fluid amyloid-be...,1,0
...,...,...,...
12160,object studi investig clinic imag characterist...,0,0
12161,it common find substanti alzheim diseas ad les...,0,0
12162,tau amyloid pathobiolog process underli alzhei...,0,0
12163,coronaviru diseas covid-19 remain major sourc ...,0,0


In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer 

def craate_tf(df):
    prep_list = []
    for doc in df.abstract.to_list():
        prep_ = []
        for do in doc.split(" "):
            prep_.append(do)
        prep_list.append(prep_)

    def tokenize_id(sentence):
        return sentence
    tfidf = TfidfVectorizer(lowercase=False, tokenizer=tokenize_id)
    vector = tfidf.fit_transform(prep_list).toarray()

    print(vector[2])

    tf = pd.DataFrame(vector, columns=tfidf.vocabulary_)
    tf["labels"] = df.judgement
    tf["isnan"] = df.is_nan  

    x_train, x_test, y_train, y_test = train_test_split(tf.drop(["labels"], axis=1), tf[["labels"]], 
                                                        test_size=.2, stratify=tf.labels)
    return x_train, x_test, y_train, y_test 

x_train, x_test, y_train, y_test = create_tf(df)


[0.09520824 0.         0.         ... 0.         0.         0.        ]


In [None]:
from utils.reduction import PCA_

pca = PCA_()
pca.fit(x_train, y_train, is_plot=True)

array([0.01424029, 0.0088624 ])

In [None]:
ratio = pca.fit(x_train, y_train, n_components=500) # 1000次元では説明力60%超えた。
x_train_pca = pca.transform(x_train)
x_test_pcs = pca.transform(x_test)
ratio.cumsum()[-1]

0.46356683346143474

In [None]:
x_train_pca.shape

(5600, 500)

In [57]:
import numpy as np 

def save_np(df, filename):
    a = np.array(df)
    np.save(filename, a)

save_np(x_train_pca, "x_train_pca")
save_np(x_test_pcs, "x_test_pca")
save_np(y_train, "y_train_pca")
save_np(y_test, "y_test_pca")

### tokeinzer preprocess 
1. クレンジング処理
1. 品詞指定
1. bertによるトークン化。なお頻出トークンを上位`100`個追加すること。`new-vocab.pkl`に保存してます。input_idsの次元数は`256`で固定。

In [None]:
!pip install -q transformers 

[K     |████████████████████████████████| 2.6 MB 7.0 MB/s 
[K     |████████████████████████████████| 636 kB 56.7 MB/s 
[K     |████████████████████████████████| 895 kB 31.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 42.2 MB/s 
[?25h

In [21]:
from transformers import BertTokenizer 
tokenizer = BertTokenizer(vocab_file="en-vocab-bert.txt", do_lower_case=False)

In [33]:
from typing import Dict 
import pickle 

def get_many_token(df: pd.DataFrame) -> Dict[str, int]:
    '''頻出するトークン上位１００を取得してbertのボキャブラリーに加える'''
    many_words = {}
    for doc in df.abstract.to_list():
        words = doc.split(" ")
        for word in words:
            if word not in many_words:
                many_words[word] = 1
            else:
                many_words[word] += 1  

    many_list, num_list = [], []
    for k, v in many_words.items():
        many_list.append(k)
        num_list.append(v)
    sample = pd.DataFrame({"word": many_list, "num": num_list})
    sample = sample.sort_values(by="num", ascending=False)
    sample = sample.iloc[:100, :]

    many = {}
    for k, v in zip(sample.word.to_list(), sample.num.to_list()):
        many[k] = v 
    return many 

def save_vocab(vocab: Dict[str, int]):
    with open("new-vocab.pkl", "wb") as f:
        pickle.dump(vocab, f)

new_vocab = get_many_token(df)
save_vocab(new_vocab)

In [30]:
tokenizer.add_tokens([k for k, v in new_vocab.items()], special_tokens=True)

38

In [36]:
max_len = 0 
for doc in df.abstract.to_list():
    if max_len < len(doc):
        max_len = len(doc)
print(max_len)

5385


In [59]:
import torch 
from typing import Dict, List 
def create_token(df) -> List[dict]:
    '''input_ids/attentin_mask/token_type_ids/labelsのテンソルを作成'''
    dataset = []
    for doc, label in zip(df.abstract.to_list(), df.judgement.to_list()):
        seq = " ".join(doc)
        encoding = tokenizer(seq, truncation=True, max_length=256, padding="max_length", 
                             return_tensors="pt")
        encoding["labels"] = label 
        encoding = {k: torch.tensor(v, dtype=torch.long) for k, v in encoding.items()}
        dataset.append(encoding)
    return dataset 

token_data = create_token(df)
token_data[3]["input_ids"].size()


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



torch.Size([1, 256])

In [52]:
import pickle 
with open("tokenize.pkl", "wb") as f:
    pickle.dump(token_data, f)