## Abstract에서 Keyphrase 추출
- 가상환경 sait
- extractor(): 가장 긴 abstract(abstracts[423])에 대해서도 끝부분까지 keyphrase 잘 추출하는 것 확인함

In [1]:
import pickle
from tqdm.notebook import tqdm

# Load the preprocessed abstracts from the pickle file.
with open('Preprocessed_Data/H01L_2020-2022_9585_abstract.pickle', 'rb') as abstract_file:
    abstracts = pickle.load(abstract_file)

In [2]:
# Number of loaded abstracts.
len(abstracts)

9585

In [3]:
# Inspect the first abstract.
abstracts[0]

'The wafer polishing system is disclosed. The wafer polishing system may comprise a polishing unit; a slurry distribution unit mounted on the polishing unit and distributing a slurry flowing into the polishing unit for wafer polishing; a slurry tank connected to the slurry distribution unit and storing the slurry; a slurry pump connected to the polishing unit and the slurry tank for transferring the slurry from the slurry tank to the polishing unit; a first circulation line in which one side is connected to the slurry tank; a second circulation line in which one side is connected to the other side of the first circulation line and the other side is connected to the slurry distribution unit; and a cleaning liquid supply unit connected to the second circulation line for supplying a cleaning liquid flowing through the second circulation line.'

In [4]:
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np

# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that also returns plain keyphrase strings.

    Calling an instance yields a ``(results, keyphrases)`` tuple as built by
    :meth:`postprocess`.
    """

    def __init__(self, model, *args, **kwargs):
        """Load model and tokenizer for ``model`` and initialise the pipeline.

        Args:
            model: Identifier passed to ``from_pretrained`` for both the
                token-classification model and its tokenizer.
            *args: Extra positional arguments forwarded to the parent
                constructor.
            **kwargs: Extra keyword arguments forwarded to the parent
                constructor.
        """
        token_classifier = AutoModelForTokenClassification.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        super().__init__(
            *args,
            model=token_classifier,
            tokenizer=tokenizer,
            **kwargs,
        )

    def postprocess(self, all_outputs):
        """Aggregate model outputs and collect the keyphrase texts.

        Args:
            all_outputs: Raw outputs handed over by the pipeline; passed
                unchanged to the parent ``postprocess``.

        Returns:
            A tuple ``(results, keyphrases)`` where ``results`` is whatever
            the parent ``postprocess`` returns with
            ``AggregationStrategy.SIMPLE`` and ``keyphrases`` is the list of
            whitespace-stripped ``"word"`` values, one per result.
        """
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )

        # Repeated keyphrases are kept as-is (no de-duplication).
        keyphrases = []
        for entity in results:
            keyphrases.append(entity.get("word").strip())

        return results, keyphrases

In [5]:
# CUDA is not connected in this environment, so the GPU is not used.
# Load pipeline
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=model_name)

Using /home/hjkim/.cache/torch_extensions/py39_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Failed to load CUDA kernels. Mra requires custom CUDA kernels. Please verify that compatible versions of PyTorch and CUDA Toolkit are installed: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.


In [6]:
%%time
# result: one dict per abstract with its text and the keyphrases found in it.
# list_keyphrases: flat list of every extracted keyphrase (duplicates kept).
result = []
list_keyphrases = []

for abstract in tqdm(abstracts):
    entities, phrase_texts = extractor(abstract)
    list_keyphrases.extend(phrase_texts)

    # Keep the stripped phrase text plus its character span in the abstract.
    result.append({
        'abstract': abstract,
        'keyphrases': [
            {
                'keyphrase': entity['word'].strip(),
                'start_index': entity['start'],
                'end_index': entity['end'],
            }
            for entity in entities
        ],
    })

  0%|          | 0/9585 [00:00<?, ?it/s]

CPU times: user 6h 54min 53s, sys: 15.3 s, total: 6h 55min 8s
Wall time: 52min 8s


In [14]:
print(len(result))  # one entry per processed abstract
print(len(list_keyphrases))  # total extracted keyphrases, duplicates included

9585
99169


In [15]:
from collections import Counter

# Count how often each keyphrase occurs and order them by descending count.
# NOTE: this rebinds list_keyphrases from a flat list of strings to a list of
# (keyphrase, count) tuples, so re-running this cell would count those tuples
# rather than the original strings.
list_keyphrases = Counter(list_keyphrases).most_common()
print(len(list_keyphrases))  # number of distinct keyphrases

24995


In [20]:
# Inspect the extraction result for the first abstract.
result[0]

{'abstract': 'The wafer polishing system is disclosed. The wafer polishing system may comprise a polishing unit; a slurry distribution unit mounted on the polishing unit and distributing a slurry flowing into the polishing unit for wafer polishing; a slurry tank connected to the slurry distribution unit and storing the slurry; a slurry pump connected to the polishing unit and the slurry tank for transferring the slurry from the slurry tank to the polishing unit; a first circulation line in which one side is connected to the slurry tank; a second circulation line in which one side is connected to the other side of the first circulation line and the other side is connected to the slurry distribution unit; and a cleaning liquid supply unit connected to the second circulation line for supplying a cleaning liquid flowing through the second circulation line.',
 'keyphrases': [{'keyphrase': 'wafer polishing system',
   'start_index': 4,
   'end_index': 26},
  {'keyphrase': 'wafer polishing sy

In [21]:
# First 30 (keyphrase, count) pairs, i.e. the most frequent keyphrases.
list_keyphrases[:30]

[('semiconductor device', 1518),
 ('semiconductor substrate', 1218),
 ('substrate', 854),
 ('dielectric layer', 654),
 ('gate structure', 530),
 ('first', 495),
 ('semiconductor structure', 493),
 ('semiconductor wafer', 488),
 ('layer', 474),
 ('top surface', 456),
 ('semiconductor layer', 428),
 ('second', 415),
 ('process chamber', 407),
 ('substrate processing apparatus', 380),
 ('processing chamber', 368),
 ('fin structure', 359),
 ('gate electrode', 341),
 ('embodiment', 295),
 ('semiconductor fin', 280),
 ('gate stack', 276),
 ('plasma', 260),
 ('insulating layer', 257),
 ('semiconductor material', 246),
 ('conductive layer', 246),
 ('channel region', 240),
 ('conductive material', 227),
 ('electrostatic chuck', 226),
 ('gate dielectric layer', 213),
 ('dielectric material', 209),
 ('semiconductor devices', 173)]

In [22]:
# Save both outputs as pickle files: the per-abstract results first,
# then the frequency-ordered keyphrase list.
with open('Preprocessed_Data/H01L_2020-2022_9585_abstract_keyphrases.pickle','wb') as result_file:
    pickle.dump(result, result_file)
with open('Preprocessed_Data/H01L_2020-2022_9585_abstract_keyphrases_list.pickle','wb') as counts_file:
    pickle.dump(list_keyphrases, counts_file)

## 결과 불러오기
- keyphrases : [abstract 1, abstract 2, ... ]인 리스트
  - abstract n : key가 'abstract', 'keyphrases'인 딕셔너리
    - abstract: abstract 텍스트
    - keyphrases: [keyphrase 1, keyphrase 2, ... ]인 리스트
      - keyphrase n: key가 'keyphrase', 'start_index', 'end_index'인 딕셔너리
        - keyphrase: keyphrase 텍스트
        - start_index: abstract에서 해당 keyphrase가 시작하는 index
        - end_index: abstract에서 해당 keyphrase가 끝나는 위치의 바로 다음 index (exclusive, 즉 abstract[start_index:end_index]가 keyphrase 텍스트)
        
- keyphrases_list: (keyphrase, 빈도) 튜플을 빈도 내림차순으로 나열한 리스트

In [23]:
# Load the saved pickle files back.
# NOTE: pickle.load can execute arbitrary code; only load files you trust
# (here, the ones written by this notebook).
with open('Preprocessed_Data/H01L_2020-2022_9585_abstract_keyphrases.pickle', 'rb') as result_file:
    keyphrases = pickle.load(result_file)
with open('Preprocessed_Data/H01L_2020-2022_9585_abstract_keyphrases_list.pickle', 'rb') as counts_file:
    keyphrases_list = pickle.load(counts_file)

In [26]:
print(len(keyphrases)) # same as the total number of patents
print(len(keyphrases_list)) # number of unique keyphrases

9585
24995


In [33]:
# Example
print(keyphrases[0]['abstract']) # abstract of the first patent
print()
print(keyphrases[0]['keyphrases'][0]) # first keyphrase of the first patent

The wafer polishing system is disclosed. The wafer polishing system may comprise a polishing unit; a slurry distribution unit mounted on the polishing unit and distributing a slurry flowing into the polishing unit for wafer polishing; a slurry tank connected to the slurry distribution unit and storing the slurry; a slurry pump connected to the polishing unit and the slurry tank for transferring the slurry from the slurry tank to the polishing unit; a first circulation line in which one side is connected to the slurry tank; a second circulation line in which one side is connected to the other side of the first circulation line and the other side is connected to the slurry distribution unit; and a cleaning liquid supply unit connected to the second circulation line for supplying a cleaning liquid flowing through the second circulation line.

{'keyphrase': 'wafer polishing system', 'start_index': 4, 'end_index': 26}


In [35]:
# Example: recover a keyphrase from its abstract via the stored indices.
first_patent = keyphrases[0]
first_keyphrase = first_patent['keyphrases'][0]
start = first_keyphrase['start_index']
end = first_keyphrase['end_index']

print(first_keyphrase['keyphrase'])
# Slicing the abstract with the stored indices gives the same text.
print(first_patent['abstract'][start:end])

wafer polishing system
wafer polishing system
